pep-material: solutions2/danube.scala@ed29d988b984 (annotated)

284 fc20e5f83f0e updated Christian Urban <urbanc@in.tum.de> parents: 209 diff changeset	1	// Core Part about Movie Recommendations
209 402a5fe4abb7 updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	2	// at Danube.co.uk
284 fc20e5f83f0e updated Christian Urban <urbanc@in.tum.de> parents: 209 diff changeset	3	//========================================
209 402a5fe4abb7 updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	4
402a5fe4abb7 updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	5	import io.Source
402a5fe4abb7 updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	6	import scala.util._
402a5fe4abb7 updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	7
object CW7b { // for purposes of generating a jar

  // Imported locally so that this object is self-contained.
  import scala.io.Source

  // (1) Implement the function get_csv_url which takes an url-string
  //     as argument and requests the corresponding file. The two urls
  //     of interest are ratings_url and movies_url, which correspond
  //     to CSV-files.
  //     The function should return the CSV file appropriately broken
  //     up into lines, and the first line should be dropped (that is without
  //     the header of the CSV file). The result is a list of strings (lines
  //     in the file).

  // Downloads the resource at `url` (decoded as ISO-8859-1), splits it
  // at "\n" and returns all lines except the first (header) line.
  def get_csv_url(url: String) : List[String] = {
    val csv = Source.fromURL(url)("ISO-8859-1")
    // The content is read completely inside the try, so the underlying
    // stream can be released in all cases, including read failures.
    try csv.mkString.split("\n").toList.drop(1)
    finally csv.close()
  }

  val ratings_url = """https://nms.kcl.ac.uk/christian.urban/ratings.csv"""
  val movies_url = """https://nms.kcl.ac.uk/christian.urban/movies.csv"""

  // test cases

  //val ratings = get_csv_url(ratings_url)
  //val movies = get_csv_url(movies_url)

  //ratings.length  // 87313
  //movies.length   // 9742

  // (2) Implement two functions that process the CSV files. The ratings
  //     function filters out all ratings below 4 and returns a list of
  //     (userID, movieID) pairs. The movies function just returns a list
  //     of (movieId, title) pairs.

  // Splits every line at "," and keeps the (first, second) column pair of
  // each line whose third column parses as a number >= 4. Lines with fewer
  // than three columns or with a non-numeric third column are skipped
  // instead of raising an exception.
  def process_ratings(lines: List[String]) : List[(String, String)] =
    lines.map(_.split(",").toList).collect {
      case user :: movie :: rating :: _ if rating.toFloatOption.exists(_ >= 4) =>
        (user, movie)
    }

  // Splits every line at "," and returns the (first, second) column pair.
  // Lines with fewer than two columns (e.g. blank lines) are skipped
  // instead of raising an exception.
  def process_movies(lines: List[String]) : List[(String, String)] =
    lines.map(_.split(",").toList).collect {
      case id :: title :: _ => (id, title)
    }

  // test cases

  //val good_ratings = process_ratings(ratings)
  //val movie_names = process_movies(movies)

  //good_ratings.length   //48580
  //movie_names.length    // 9742

  //==============================================
  // Do not change anything below, unless you want
  // to submit the file for the advanced part 3!
  //==============================================


  // (3) Implement a grouping function that calculates a map
  //     containing the userIds and all the corresponding recommendations
  //     (list of movieIds). This should be implemented in a tail
  //     recursive fashion, using a map m as accumulator. This map
  //     is set to Map() at the beginning of the calculation.

  // Each movie is prepended to the list already stored for its user, so
  // the per-user lists are in reverse order of appearance in `ratings`.
  // The annotation makes the compiler reject any non-tail-recursive change.
  @scala.annotation.tailrec
  def groupById(ratings: List[(String, String)],
                m: Map[String, List[String]]) : Map[String, List[String]] = ratings match {
    case Nil => m
    case (id, mov) :: rest =>
      groupById(rest, m.updated(id, mov :: m.getOrElse(id, Nil)))
  }

  // test cases
  //val ratings_map = groupById(good_ratings, Map())
  //val movies_map = movie_names.toMap

  //ratings_map.get("414").get.map(movies_map.get(_)) // most prolific recommender with 1227 positive ratings
  //ratings_map.get("474").get.map(movies_map.get(_)) // second-most prolific recommender with 787 positive ratings
  //ratings_map.get("214").get.map(movies_map.get(_)) // least prolific recommender with only 1 positive rating



  // (4) Implement a function that takes a ratings map and a movie_name as argument.
  //     The function calculates all suggestions containing
  //     the movie mov in its recommendations. It returns a list of all these
  //     recommendations (each of them is a list and needs to have mov deleted,
  //     otherwise it might happen we recommend the same movie).

  // Walks over the recommendation lists in the map's iteration order, keeps
  // those that contain `mov`, and removes every occurrence of `mov` from them.
  def favourites(m: Map[String, List[String]], mov: String) : List[List[String]] =
    m.values.toList.collect {
      case movs if movs.contains(mov) => movs.filter(_ != mov)
    }



  // test cases
  // movie ID "912" -> Casablanca (1942)
  //          "858" -> Godfather
  //          "260" -> Star Wars: Episode IV - A New Hope (1977)

  //favourites(ratings_map, "912").length  // => 80

  // That means there are 80 users that recommend the movie with ID 912.
  // Of these 80 users, 55 gave a good rating to movie 858 and
  // 52 a good rating to movies 260, 318, 593.


  // (5) Implement a suggestions function which takes a rating
  //     map and a movie_name as arguments. It calculates all the recommended
  //     movies sorted according to the most frequently suggested movie(s) first.

  // Counts how often each movie occurs in the favourites of `mov_name` and
  // returns the movie ids ordered by descending count. The ascending sort
  // followed by `reverse` is kept on purpose: it determines the order of
  // movies with equal counts, which callers may observe.
  def suggestions(recs: Map[String, List[String]],
                  mov_name: String) : List[String] = {
    val favs = favourites(recs, mov_name).flatten
    val favs_counted = favs.groupBy(identity).view.mapValues(_.size).toList
    val favs_sorted = favs_counted.sortBy(_._2).reverse
    favs_sorted.map(_._1)
  }

  // test cases

  //suggestions(ratings_map, "912")
  //suggestions(ratings_map, "912").length
  // => 4110 suggestions with List(858, 260, 318, 593, ...)
  //    being the most frequently suggested movies

  // (6) Implement recommendations functions which generates at most
  //     two of the most frequently suggested movies. It returns the
  //     actual movie names, not the movieIDs.

  // Translates the suggested movie ids into titles via `movs` and returns
  // the first two titles found. Ids without an entry in `movs` are skipped
  // rather than raising NoSuchElementException; the iterator avoids
  // translating more suggestions than needed.
  def recommendations(recs: Map[String, List[String]],
                      movs: Map[String, String],
                      mov_name: String) : List[String] =
    suggestions(recs, mov_name).iterator.flatMap(id => movs.get(id)).take(2).toList


  // testcases

  // recommendations(ratings_map, movies_map, "912")
  //   => List(Godfather, Star Wars: Episode IV - A New Hope (1977))

  //recommendations(ratings_map, movies_map, "260")
  //   => List(Star Wars: Episode V - The Empire Strikes Back (1980),
  //           Star Wars: Episode VI - Return of the Jedi (1983))

  // recommendations(ratings_map, movies_map, "2")
  //   => List(Lion King, Jurassic Park (1993))

  // recommendations(ratings_map, movies_map, "0")
  //   => Nil

  // recommendations(ratings_map, movies_map, "1")
  //   => List(Shawshank Redemption, Forrest Gump (1994))

  // recommendations(ratings_map, movies_map, "4")
  //   => Nil  (there are three ratings for this movie in ratings.csv but they are not positive)


  // If you want to calculate the recommendations for all movies.
  // Will take a few seconds calculation time.

  //val all = for (name <- movie_names.map(_._1)) yield {
  //  recommendations(ratings_map, movies_map, name)
  //}

  // helper functions
  //List().take(2)
  //List(1).take(2)
  //List(1,2).take(2)
  //List(1,2,3).take(2)

}

author	Christian Urban <urbanc@in.tum.de>
	Sat, 02 Nov 2019 21:23:42 +0000
changeset 307	ed29d988b984
parent 284	fc20e5f83f0e
child 325	26058bf089ae
permissions	-rw-r--r--