main_testing2/danube.scala
changeset 403 ffce7b61b446
parent 384 6e1237691307
equal deleted inserted replaced
402:de59aa20a1dc 403:ffce7b61b446
     1 // Core Part about Movie Recommendations
     1 // Core Part about Movie Recommendations 
     2 // at Danube.co.uk
     2 // at Danube.co.uk
     3 //===========================================
     3 //========================================
     4 
     4 
     5 object CW7b {
     5 
       
     6 object M2 { // for purposes of generating a jar
     6 
     7 
     7 import io.Source
     8 import io.Source
     8 import scala.util._
     9 import scala.util._
     9 
    10 
    10 
    11 
    11 // (1) Implement the function get_csv_url which takes an url-string
    12 // (1) Implement the function get_csv_url which takes an url-string
    12 //     as argument and requests the corresponding file. The two urls
    13 //     as argument and requests the corresponding file. The two urls
    13 //     of interest are ratings_url and movies_url, which correspond 
    14 //     of interest are ratings_url and movies_url, which correspond 
    14 //     to CSV-files.
    15 //     to CSV-files.
    15 //
    16 //     The function should return the CSV file appropriately broken
    16 //     The function should ReTurn the CSV-file appropriately broken
       
    17 //     up into lines, and the first line should be dropped (that is without
    17 //     up into lines, and the first line should be dropped (that is without
    18 //     the header of the CSV-file). The result is a list of strings (lines
    18 //     the header of the CSV file). The result is a list of strings (lines
    19 //     in the file).
    19 //     in the file).
    20 
    20 
    21 def get_csv_url(url: String) : List[String] = {
    21 def get_csv_url(url: String) : List[String] = {
    22   val site = Source.fromURL(url, "ISO-8859-1")
    22   val csv = Source.fromURL(url)("ISO-8859-1")
    23   val site_string = site.mkString
    23   csv.mkString.split("\n").toList.drop(1)
    24   val output = (site_string.split("\n")).toList
       
    25   output.tail
       
    26 }
    24 }
    27 
    25 
    28   // get_csv_url("https://nms.kcl.ac.uk/christian.urban/ratings.csv")
    26 val ratings_url = """https://nms.kcl.ac.uk/christian.urban/ratings.csv"""
       
    27 val movies_url = """https://nms.kcl.ac.uk/christian.urban/movies.csv"""
    29 
    28 
    30 //val ratings_url = """https://nms.kcl.ac.uk/christian.urban/ratings.csv"""
    29 // test cases
    31 //val movies_url = """https://nms.kcl.ac.uk/christian.urban/movies.csv"""
       
    32 
    30 
    33 // testcases
    31 //val ratings = get_csv_url(ratings_url)
    34 //-----------
       
    35 //:
       
    36 //val movies = get_csv_url(movies_url)
    32 //val movies = get_csv_url(movies_url)
    37   // val ratings = get_csv_url(ratings_url)
       
    38 
    33 
    39 //ratings.length  // 87313
    34 //ratings.length  // 87313
    40 //movies.length   // 9742
    35 //movies.length   // 9742
    41 
    36 
    42 
    37 // (2) Implement two functions that process the CSV files. The ratings
    43 // (2) Implement two functions that process the CSV-files from (1). The ratings
    38 //     function filters out all ratings below 4 and returns a list of 
    44 //     function filters out all ratings below 4 and ReTurns a list of 
    39 //     (userID, movieID) pairs. The movies function just returns a list 
    45 //     (userID, movieID) pairs. The movies function just ReTurns a list 
    40 //     of (movieId, title) pairs.
    46 //     of (movieID, title) pairs. Note the input to these functions, that is
       
    47 //     the argument lines, will be the output of the function get_csv_url.
       
    48 
    41 
    49 
    42 
    50 def process_ratings(lines: List[String]) : List[(String, String)] = {
    43 def process_ratings(lines: List[String]) : List[(String, String)] = {
    51   val filter = lines.filter(_.last.asDigit >=4)
    44   for (cols <- lines.map(_.split(",").toList); 
    52   val output = (for(i <- 0 until filter.length) yield ((filter(i).split(",").toList)(0), (filter(i).split(",").toList)(1))).toList
    45        if (cols(2).toInt >= 4)) yield (cols(0), cols(1))  
    53   output
       
    54 }
    46 }
    55 
    47 
    56 def process_movies(lines: List[String]) : List[(String, String)] = {
    48 def process_movies(lines: List[String]) : List[(String, String)] = {
    57   val output = (for(i <- 0 until lines.length) yield ((lines(i).split(",").toList)(0), (lines(i).split(",").toList)(1))).toList
       
    58   output
       
    59 }
       
    60 
       
    61 
       
    62 
       
    63 def process_ratings2(lines: List[String]) : List[(String, String)] = {
       
    64   for (cols <- lines.map(_.split(",").toList); 
       
    65        if (cols(2).toFloat >= 4)) yield (cols(0), cols(1))  
       
    66 }
       
    67 
       
    68 def process_movies2(lines: List[String]) : List[(String, String)] = {
       
    69   for (cols <- lines.map(_.split(",").toList)) yield (cols(0), cols(1))  
    49   for (cols <- lines.map(_.split(",").toList)) yield (cols(0), cols(1))  
    70 }
    50 }
    71 
    51 
    72 // testcases
    52 // test cases
    73 //-----------
    53 
    74 //val good_ratings = process_ratings(ratings)
    54 //val good_ratings = process_ratings(ratings)
    75 //val movie_names = process_movies(movies)
    55 //val movie_names = process_movies(movies)
    76 
    56 
    77 //good_ratings.length   //48580
    57 //good_ratings.length   //48580
    78 //movie_names.length    // 9742
    58 //movie_names.length    // 9742
    79 
    59 
    80 
    60 
    81 
    61 // (3) Implement a grouping function that calculates a map
    82 
    62 //     containing the userIds and all the corresponding recommendations 
    83 // (3) Implement a grouping function that calculates a Map
    63 //     (list of movieIds). This  should be implemented in a tail
    84 //     containing the userIDs and all the corresponding recommendations 
    64 //     recursive fashion, using a map m as accumulator. This map
    85 //     (list of movieIDs). This  should be implemented in a tail
    65 //     is set to Map() at the beginning of the calculation.
    86 //     recursive fashion, using a Map m as accumulator. This Map m
       
    87 //     is set to Map() at the beginning of the calculation.
       
    88 
       
    89 val ratings_url = """https://nms.kcl.ac.uk/christian.urban/ratings.csv"""
       
    90 val ratings = get_csv_url(ratings_url)
       
    91 val good_ratings = process_ratings(ratings)
       
    92 val v515 = good_ratings.filter(_._1 == "515")
       
    93 val v515_2 = v515.map(_._2)
       
    94 
    66 
    95 def groupById(ratings: List[(String, String)], 
    67 def groupById(ratings: List[(String, String)], 
    96               m: Map[String, List[String]]) : Map[String, List[String]] = {
       
    97 val users = (for((k,v) <- ratings) yield k).distinct
       
    98 val movie_ids = (for(i <- 1 to users.length) yield
       
    99   (for ((k,v) <- ratings if(i.toString == k)) yield v).toList).toList
       
   100   val out_map = (users zip movie_ids).toMap
       
   101 out_map
       
   102 }
       
   103 
       
   104 def groupById2(ratings: List[(String, String)], 
       
   105               m: Map[String, List[String]]) : Map[String, List[String]] = ratings match {
    68               m: Map[String, List[String]]) : Map[String, List[String]] = ratings match {
   106   case Nil => m
    69   case Nil => m
   107   case (id, mov) :: rest => {
    70   case (id, mov) :: rest => {
   108     val old_ratings = m.getOrElse (id, Nil)
    71     val old_ratings = m.getOrElse (id, Nil)
   109     val new_ratings = m + (id -> (mov :: old_ratings))
    72     val new_ratings = m + (id -> (mov :: old_ratings))
   110     groupById2(rest, new_ratings)
    73     groupById(rest, new_ratings)
   111   }
    74   }
   112 }
    75 }
   113 
    76 
   114 val ls0_urban = 
    77 // test cases
   115   List(("1", "a"), ("1", "c"), ("1", "c"))
       
   116 
       
   117 groupById(ls0_urban, Map())
       
   118 groupById2(ls0_urban, Map())
       
   119 
       
   120 val ls00_urban = 
       
   121   List(("3", "a"), ("3", "c"), ("3", "c"))
       
   122 
       
   123 groupById(ls00_urban, Map())
       
   124 groupById2(ls00_urban, Map())
       
   125 
       
   126 groupById(good_ratings, Map()).getOrElse("515", Nil)
       
   127 groupById2(good_ratings, Map()).getOrElse("515", Nil)
       
   128 
       
   129 val ls1_urban = 
       
   130   List(("1", "a"), ("2", "a"), 
       
   131        ("1", "c"), ("2", "a"), ("1", "c"))
       
   132 
       
   133 groupById(ls1_urban, Map())
       
   134 groupById2(ls1_urban, Map())
       
   135 
       
   136 val ls2_urban = 
       
   137   List(("1", "a"), ("1", "b"), ("2", "x"), 
       
   138        ("3", "a"), ("2", "y"), ("3", "c"))
       
   139 
       
   140 groupById(ls2_urban, Map())
       
   141 groupById2(ls2_urban, Map())
       
   142 
       
   143 val ls3_urban = (1 to 1000 by 10).map(_.toString).toList
       
   144 val ls4_urban = ls3_urban zip ls3_urban.tail
       
   145 val ls5_urban = ls4_urban ::: ls4_urban.reverse
       
   146 
       
   147 groupById(ls5_urban, Map()) == groupById2(ls5_urban, Map())
       
   148 
       
   149 groupById(ls5_urban, Map())
       
   150 groupById2(ls5_urban, Map())
       
   151 
       
   152 groupById(v515, Map())
       
   153 groupById2(v515, Map())
       
   154 
       
   155 groupById(v515.take(1), Map())
       
   156 groupById2(v515.take(2), Map())
       
   157 
       
   158 // testcases
       
   159 //-----------
       
   160 //val ratings_map = groupById(good_ratings, Map())
    78 //val ratings_map = groupById(good_ratings, Map())
   161 //val movies_map = movie_names.toMap
    79 //val movies_map = movie_names.toMap
   162 
    80 
   163 //ratings_map.get("414").get.map(movies_map.get(_)).length
    81 //ratings_map.get("414").get.map(movies_map.get(_)) // most prolific recommender with 1227 positive ratings
   164 //    => most prolific recommender with 1227 positive ratings
    82 //ratings_map.get("474").get.map(movies_map.get(_)) // second-most prolific recommender with 787 positive ratings
   165 
    83 //ratings_map.get("214").get.map(movies_map.get(_)) // least prolific recommender with only 1 positive rating
   166 //ratings_map.get("475").get.map(movies_map.get(_)).length
       
   167 //    => second-most prolific recommender with 787 positive ratings
       
   168 
       
   169 //ratings_map.get("214").get.map(movies_map.get(_)).length 
       
   170 //    => least prolific recommender with only 1 positive rating
       
   171 
    84 
   172 
    85 
   173 // (4) Implement a function that takes a ratings map and a movie_name as argument.
       
   174 //     The function calculates all suggestions containing
       
   175 //     the movie in its recommendations. It ReTurns a list of all these
       
   176 //     recommendations (each of them is a list and needs to have the movie deleted, 
       
   177 //     otherwise it might happen we recommend the same movie).
       
   178 
    86 
       
    87 //(4) Implement a function that takes a ratings map and a movie_name as argument.
       
    88 // The function calculates all suggestions containing
       
    89 // the movie mov in its recommendations. It returns a list of all these
       
    90 // recommendations (each of them is a list and needs to have mov deleted, 
       
    91 // otherwise it might happen we recommend the same movie).
   179 
    92 
   180 def favourites(m: Map[String, List[String]], mov: String) : List[List[String]] = {
    93 def favourites(m: Map[String, List[String]], mov: String) : List[List[String]] = 
   181  (for((k,v) <- m if (v.contains(mov))) yield v.filter(_!=mov).toList).toList
       
   182 }
       
   183 
       
   184 def favourites2(m: Map[String, List[String]], mov: String) : List[List[String]] = 
       
   185   (for (id <- m.keys.toList;
    94   (for (id <- m.keys.toList;
   186         if m(id).contains(mov)) yield m(id).filter(_ != mov))
    95         if m(id).contains(mov)) yield m(id).filter(_ != mov))
   187 
    96 
   188 
    97 
   189 // testcases
    98 
   190 //-----------
    99 // test cases
   191 // movie ID "912" -> Casablanca (1942)
   100 // movie ID "912" -> Casablanca (1942)
   192 //          "858" -> Godfather
   101 //          "858" -> Godfather
   193 //          "260" -> Star Wars: Episode IV - A New Hope (1977)
   102 //          "260" -> Star Wars: Episode IV - A New Hope (1977)
   194 
   103 
   195 //favourites(ratings_map, "912").length  // => 80
   104 //favourites(ratings_map, "912").length  // => 80
   197 // That means there are 80 users that recommend the movie with ID 912.
   106 // That means there are 80 users that recommend the movie with ID 912.
   198 // Of these 80  users, 55 gave a good rating to movie 858 and
   107 // Of these 80  users, 55 gave a good rating to movie 858 and
   199 // 52 a good rating to movies 260, 318, 593.
   108 // 52 a good rating to movies 260, 318, 593.
   200 
   109 
   201 
   110 
   202 
       
   203 // (5) Implement a suggestions function which takes a rating
   111 // (5) Implement a suggestions function which takes a rating
   204 //     map and a movie_name as arguments. It calculates all the recommended
   112 // map and a movie_name as arguments. It calculates all the recommended
   205 //     movies sorted according to the most frequently suggested movie(s) first.
   113 // movies sorted according to the most frequently suggested movie(s) first.
   206 
       
   207 def suggestions(recs: Map[String, List[String]], 
   114 def suggestions(recs: Map[String, List[String]], 
   208                 mov_name: String) : List[String] = {
       
   209   val flat = favourites(recs, mov_name).flatten.groupMapReduce(identity)(_ => 1)(_ + _)
       
   210   val sorted = flat.toList.sortWith(_._2 > _._2).map(_._1)
       
   211   sorted
       
   212 }
       
   213 
       
   214 
       
   215 def mapValues[S, T, R](m: Map[S, T], f: T => R) =
       
   216   m.map { case (x, y) => (x, f(y)) }
       
   217 
       
   218 def suggestions2(recs: Map[String, List[String]], 
       
   219                     mov_name: String) : List[String] = {
   115                     mov_name: String) : List[String] = {
   220   val favs = favourites(recs, mov_name).flatten
   116   val favs = favourites(recs, mov_name).flatten
   221   val favs_counted = mapValues(favs.groupBy(identity), (v:List[String]) => v.size).toList
   117   val favs_counted = favs.groupBy(identity).view.mapValues(_.size).toList
   222   val favs_sorted = favs_counted.sortBy(_._2).reverse
   118   val favs_sorted = favs_counted.sortBy(_._2).reverse
   223   favs_sorted.map(_._1)
   119   favs_sorted.map(_._1)
   224 }
   120 }
   225 
   121 
   226 // testcases
   122 // test cases
   227 //-----------
       
   228 
   123 
   229 //suggestions(ratings_map, "912")
   124 //suggestions(ratings_map, "912")
   230 //suggestions(ratings_map, "912").length  
   125 //suggestions(ratings_map, "912").length  
   231 // => 4110 suggestions with List(858, 260, 318, 593, ...)
   126 // => 4110 suggestions with List(858, 260, 318, 593, ...)
   232 //    being the most frequently suggested movies
   127 //    being the most frequently suggested movies
   233 
   128 
   234 
   129 // (6) Implement a recommendations function which generates at most
   235 
   130 // *two* of the most frequently suggested movies. It Returns the 
   236 // (6) Implement a recommendations function which generates at most
   131 // actual movie names, not the movieIDs.
   237 //     *two* of the most frequently suggested movies. It ReTurns the 
       
   238 //     actual movie names, not the movieIDs.
       
   239 
       
   240 
   132 
   241 def recommendations(recs: Map[String, List[String]],
   133 def recommendations(recs: Map[String, List[String]],
   242                     movs: Map[String, String],
   134                    movs: Map[String, String],
   243                     mov_name: String) : List[String] = {
   135                    mov_name: String) : List[String] =
   244   val sugg = suggestions(recs, mov_name)
   136   suggestions(recs, mov_name).take(2).map(movs.get(_).get)                 
   245   val movies = (for (i <- 0 until 2 if (i < sugg.length)) yield movs(sugg(i))).toList
       
   246   movies
       
   247 }
       
   248 
       
   249 
   137 
   250 
   138 
   251 // testcases
   139 // testcases
   252 //-----------
   140 
   253 // recommendations(ratings_map, movies_map, "912")
   141 // recommendations(ratings_map, movies_map, "912")
   254 //   => List(Godfather, Star Wars: Episode IV - A New Hope (1977))
   142 //   => List(Godfather, Star Wars: Episode IV - A New Hope (1977))
   255 
   143 
   256 //recommendations(ratings_map, movies_map, "260")
   144 //recommendations(ratings_map, movies_map, "260")
   257 //   => List(Star Wars: Episode V - The Empire Strikes Back (1980), 
   145 //   => List(Star Wars: Episode V - The Empire Strikes Back (1980), 
   268 
   156 
   269 // recommendations(ratings_map, movies_map, "4")
   157 // recommendations(ratings_map, movies_map, "4")
   270 //   => Nil  (there are three ratings for this movie in ratings.csv but they are not positive)     
   158 //   => Nil  (there are three ratings for this movie in ratings.csv but they are not positive)     
   271 
   159 
   272 
   160 
   273 
       
   274 // (7) Calculate the recommendations for all movies according to
       
   275 // what the recommendations function in (6) produces (this
       
   276 // can take a few seconds). Put all recommendations into a list 
       
   277 // (of strings) and count how often the strings occur in
       
   278 // this list. This produces a list of string-int pairs,
       
   279 // where the first component is the movie name and the second
       
   280 // is the number of how many times the movie was recommended. 
       
   281 // Sort all the pairs according to the number
       
   282 // of times they were recommended (most recommended movie name 
       
   283 // first).
       
   284 
       
   285 def most_recommended(recs: Map[String, List[String]],
       
   286                      movs: Map[String, String]) : List[(String, Int)] = {
       
   287   val movies = (((for((k,v) <- movs) yield recommendations(recs, movs, k)).toList).flatten).groupMapReduce(identity)(_ => 1)(_ + _)
       
   288   val sorted = movies.toList.sortWith(_._2 > _._2)
       
   289   sorted
       
   290 }
   161 }
   291 
       
   292 // testcase
       
   293 //
       
   294 //most_recommended(ratings_map, movies_map).take(3)
       
   295 // =>
       
   296 // List((Matrix,698), 
       
   297 //      (Star Wars: Episode IV - A New Hope (1977),402), 
       
   298 //      (Jerry Maguire (1996),382))
       
   299 
       
   300 
       
   301 
       
   302 }