diff -r 7a5ad01a85b5 -r 5616b45d656f main_testing2/danube.scala --- a/main_testing2/danube.scala Sat Nov 28 15:58:36 2020 +0000 +++ b/main_testing2/danube.scala Mon Nov 30 00:06:15 2020 +0000 @@ -1,17 +1,19 @@ // Core Part about Movie Recommendations // at Danube.co.uk -//=========================================== +//======================================== + + +object CW7b { // for purposes of generating a jar import io.Source import scala.util._ -object CW7b { // for purposes of generating a jar // (1) Implement the function get_csv_url which takes an url-string // as argument and requests the corresponding file. The two urls // of interest are ratings_url and movies_url, which correspond // to CSV-files. -// The function should ReTurn the CSV file appropriately broken +// The function should return the CSV file appropriately broken // up into lines, and the first line should be dropped (that is without // the header of the CSV file). The result is a list of strings (lines // in the file). @@ -33,14 +35,19 @@ //movies.length // 9742 // (2) Implement two functions that process the CSV files. The ratings -// function filters out all ratings below 4 and ReTurns a list of -// (userID, movieID) pairs. The movies function just ReTurns a list +// function filters out all ratings below 4 and returns a list of +// (userID, movieID) pairs. The movies function just returns a list // of (movieId, title) pairs. +//def process_ratings(lines: List[String]) : List[(String, String)] = { +// for (cols <- lines.map(_.split(",").toList); +// if (cols(2).toFloat >= 4)) yield (cols(0), cols(1)) +//} + def process_ratings(lines: List[String]) : List[(String, String)] = { for (cols <- lines.map(_.split(",").toList); - if (cols(2).toFloat >= 4)) yield (cols(0), cols(1)) + if (cols(2).toInt >= 4)) yield (cols(0), cols(1)) } def process_movies(lines: List[String]) : List[(String, String)] = { @@ -77,18 +84,8 @@ } } -// -//val ls = List(("1", "a"), ("2", "a"), ("1", "c"), ("2", "a"), ("1", "c")) -// -//val m = groupById(ls, Map()) -// -//m.getOrElse("1", Nil).count(_ == "c") // => 2 -//m.getOrElse("1", Nil).count(_ == "a") // => 1 - // test cases //val ratings_map = groupById(good_ratings, Map()) -//groupById(good_ratings, Map()).get("214") -//groupById(good_ratings, Map()).toList.minBy(_._2.length) //val movies_map = movie_names.toMap //ratings_map.get("414").get.map(movies_map.get(_)) // most prolific recommender with 1227 positive ratings @@ -96,10 +93,9 @@ //ratings_map.get("214").get.map(movies_map.get(_)) // least prolific recommender with only 1 positive rating - //(4) Implement a function that takes a ratings map and a movie_name as argument. // The function calculates all suggestions containing -// the movie mov in its recommendations. It ReTurns a list of all these +// the movie mov in its recommendations. It returns a list of all these // recommendations (each of them is a list and needs to have mov deleted, // otherwise it might happen we recommend the same movie). @@ -124,23 +120,14 @@ // (5) Implement a suggestions function which takes a rating // map and a movie_name as arguments. It calculates all the recommended // movies sorted according to the most frequently suggested movie(s) first. - -// needed in Scala 2.13. - -def mapValues[S, T, R](m: Map[S, T], f: T => R) = - m.map { case (x, y) => (x, f(y)) } - def suggestions(recs: Map[String, List[String]], mov_name: String) : List[String] = { val favs = favourites(recs, mov_name).flatten - val favs_counted = mapValues(favs.groupBy(identity), (v:List[String]) => v.size).toList + val favs_counted = favs.groupBy(identity).view.mapValues(_.size).toList val favs_sorted = favs_counted.sortBy(_._2).reverse favs_sorted.map(_._1) } -// check -// groupMap is equivalent to groupBy(key).mapValues(_.map(f)) - // test cases //suggestions(ratings_map, "912") @@ -163,7 +150,7 @@ // recommendations(ratings_map, movies_map, "912") // => List(Godfather, Star Wars: Episode IV - A NewHope (1977)) -//recommendations(ratings_map, movies_map, "260") +// recommendations(ratings_map, movies_map, "260") // => List(Star Wars: Episode V - The Empire Strikes Back (1980), // Star Wars: Episode VI - Return of the Jedi (1983)) @@ -177,20 +164,53 @@ // => List(Shawshank Redemption, Forrest Gump (1994)) // recommendations(ratings_map, movies_map, "4") -// => Nil (there are three ratings fro this movie in ratings.csv but they are not positive) +// => Nil (there are three ratings for this movie in ratings.csv but they are not positive) + +// (7) Calculate the recommendations for all movies according to +// what the recommendations function in (6) produces (this +// can take a few seconds). Put all recommendations into a list +// (of strings) and count how often the strings occur in +// this list. This produces a list of string-int pairs, +// where the first component is the movie name and the second +// is the number of how many times they were recommended. +// Sort all the pairs according to the number +// of times they were recommended (most recommended movie name +// first). + +def occurrences(xs: List[String]): List[(String, Int)] = + for (x <- xs.distinct) yield (x, xs.count(_ == x)) + +def most_recommended(recs: Map[String, List[String]], + movs: Map[String, String]) : List[(String, Int)] = { + val all = (for (name <- movs.toList.map(_._1)) yield { + recommendations(recs, movs, name) + }).flatten + val occs = occurrences(all) + occs.sortBy(_._2).reverse +} -// If you want to calculate the recomendations for all movies. -// Will take a few seconds calculation time. - -//val all = for (name <- movie_names.map(_._1)) yield { -// recommendations(ratings_map, movies_map, name) -//} - -// helper functions -//List().take(2) -//List(1).take(2) -//List(1,2).take(2) -//List(1,2,3).take(2) +//most_recommended(ratings_map, movies_map).take(3) +// => +// List((Matrix,698), +// (Star Wars: Episode IV - A New Hope (1977),402), +// (Jerry Maguire (1996),382)) } + +//val ratings_url = """https://nms.kcl.ac.uk/christian.urban/ratings.csv""" +//val movies_url = """https://nms.kcl.ac.uk/christian.urban/movies.csv""" + +/* +val ratings = get_csv_url(ratings_url) +val movies = get_csv_url(movies_url) + +val good_ratings = process_ratings(ratings) +val movie_names = process_movies(movies) + +val ratings_map = groupById(good_ratings, Map()) +val movies_map = movie_names.toMap + + +println(most_recommended(ratings_map, movies_map).take(3)) +*/