// Resit Exam about data-mining a chat log
//=========================================
//
// This file contains the solutions for the tasks (1) - (5) below.
//


object Resit {

import scala.io.Source
import scala.util._


// (1) The function below takes a file name as argument. It should read
//     the corresponding file and return its content. The content
//     should be returned as a list of strings, a string for each line
//     in the file. Since the file is a csv-file, the first line
//     should be dropped. Lines are separated by "\n".
//
//     The file is read with the ISO-8859-1 encoding and is closed
//     again before the function returns (also if reading fails).

def get_csv(name: String) : List[String] = {
  val csv = Source.fromFile(name, "ISO-8859-1")
  try csv.mkString.split("\n").toList.drop(1)
  finally csv.close()
}

// test cases:
//
// get_csv("log.csv")
// get_csv("log.csv").length  // should be 680


// (2) The function below takes a single line from the csv-file (as generated by
//     get_csv) and creates a Rec(ord) data structure. The data from the csv-file
//     should be copied as follows:
//
//     csv-file        Rec data structure
//     -----------------------------------
//     counter    =>   num
//     id         =>   msg_id
//     time_date  =>   date
//     name       =>   author
//     country    =>   country   (should be None if no country is given)
//     parent_id  =>   reply_id  (should be None if there is no parent)
//     msg        =>   msg
//                =>   parent is set to None   (will be calculated later)
//                =>   children is set to Nil  (will be calculated later)
//
//     BE CAREFUL that the message texts in the last field contain
//     commas, so a plain line.split(",") does not always result in
//     7 elements. Everything beyond the 6th comma belongs to the
//     field msg.

case class Rec(num: Int,
               msg_id: String,
               date: String,
               msg: String,
               author: String,
               country: Option[String],
               reply_id : Option[String],
               parent: Option[Int] = None,
               children: List[Int] = Nil)


// Splitting with the limit 7 cuts the line at the first six commas only.
// The 7th piece is the complete message text, including any commas it
// contains. A positive limit also keeps trailing empty pieces, which a
// plain split(",") silently drops: without it a line such as
// "2,hash,date,p43,,," has only 4 pieces and fails with an
// IndexOutOfBoundsException, and a message ending in commas loses them.
//
// A line with fewer than 6 fields still fails with an
// IndexOutOfBoundsException, and a non-numeric counter with a
// NumberFormatException.

def process_line(line: String) : Rec = {
  val fields = line.split(",", 7).toList
  def non_empty(s: String) : Option[String] = Some(s).filter(_.nonEmpty)
  Rec(num = fields(0).toInt,
      msg_id = fields(1),
      date = fields(2),
      author = fields(3),
      country = non_empty(fields(4)),
      reply_id = non_empty(fields(5)),
      msg = fields.lift(6).getOrElse(""))  // no 7th field => empty message
}


// test cases:
//
// process_line("""0,5ebeb459ac278d01301f1497,2020-05-15T15:25:13.413000,participant34,United Kingdom,5ebea6424923321d63155796,this question please?""")
//
// ==> Rec(0,
//         "5ebeb459ac278d01301f1497",
//         "2020-05-15T15:25:13.413000",
//         "this question please?",
//         "participant34",
//         Some("United Kingdom"),
//         Some("5ebea6424923321d63155796"),
//         None,
//         List())
//
// process_line("""1,hash,date,p43,,,foo, bar""")
//
// ==> Rec(1, "hash", "date", "foo, bar",
//         "p43", None, None, None, List())
//
// (Note that in the last test case the message needs to be "foo, bar")
//
// process_line("""2,hash,date,p43,,,""")
//
// ==> Rec(2, "hash", "date", "",
//         "p43", None, None, None, List())


// (3) Each record in the log contains a unique hash code
//     identifying each message. Some messages also contain a hash
//     code identifying the parent message (to which question they reply).
//     The function post_process fills in the information about
//     potential children and a parent message.
//
//     The auxiliary function get_children takes a record e and a
//     record list rs as arguments, and returns the list of all direct
//     children (which have the hash code of e as reply_id). The list
//     of children are returned as a list of nums.
//
//     The auxiliary function get_parent returns the number of the
//     record corresponding to the reply_id (if there exists one,
//     otherwise returns None).
//
//     In both auxiliary functions a record equal to e itself is
//     never reported as child or parent of e.

def get_children(e: Rec, rs: List[Rec]) : List[Int] =
  rs.filter(r => r != e && r.reply_id.contains(e.msg_id)).map(_.num)

def get_parent(e: Rec, rs: List[Rec]) : Option[Int] =
  rs.find(r => r != e && e.reply_id.contains(r.msg_id)).map(_.num)

def post_process(rs: List[Rec]) : List[Rec] =
  rs.map(r => r.copy(parent = get_parent(r, rs),
                     children = get_children(r, rs)))


// test cases:
//
//val recs = get_csv("log.csv").map(process_line)
//
//post_process(recs)(4).children   // List(12)
//post_process(recs)(23).children  // List(16,26)
//
//post_process(recs)(8).parent     // None
//post_process(recs)(9).parent     // Some(7)
//post_process(recs)(16).parent    // Some(23)
//post_process(recs)(26).parent    // Some(23)


// (4) The next two functions calculate the countries where
//     message authors are coming from and how many authors
//     come from each country (returned as a Map from countries
//     to numbers). In case an author did not specify a country,
//     the empty string is returned.

def get_countries(rs: List[Rec]) : Set[String] =
  rs.map(_.country.getOrElse("")).toSet

// Every (author, country) combination is counted once, regardless of
// how many messages the author wrote.

def get_countries_numbers(rs: List[Rec]) : Map[String, Int] =
  rs.map(r => (r.author, r.country.getOrElse("")))
    .distinct
    .groupMapReduce(_._2)(_ => 1)(_ + _)

// test cases:
//
//val recs = get_csv("log.csv").map(process_line)
//
// get_countries(recs) =>
//
//    Set("", Poland, Lebanon, Trinidad and
//    Tobago, Japan, Spain, Nigeria, Peru, India, Lithuania, Hong
//    Kong, Italy, Ireland, Uganda, England, Bangladesh, China,
//    Romania, Slovakia, United Kingdom, Norway, Pakistan, Indonesia,
//    Malaysia, Turkey, Portugal, Hungary)
//
// get_countries_numbers(recs) =>
//
//    Map("" -> 4, Poland -> 2, Lebanon -> 1, Trinidad and Tobago -> 2,
//        Japan -> 1, Spain -> 1, Nigeria -> 1, Peru -> 1, India -> 6,
//        Lithuania -> 1, Hong Kong -> 3, Italy -> 1, Ireland -> 2, Uganda -> 1,
//        England -> 4, Bangladesh -> 2, China -> 4, Romania -> 3,
//        Slovakia -> 1, United Kingdom -> 34, Norway -> 1, Pakistan -> 2,
//        Indonesia -> 2, Malaysia -> 1, Turkey -> 1, Portugal -> 1,
//        Hungary -> 2)


// (5) The function ordered_thread_sizes orders the message threads according to
//     how many answers were given for one message (that is how many children,
//     grand-children and so on one message received).
//
//     The auxiliary function search enumerates all children, grand-children and
//     so on for a given record r. Search returns the record r itself followed
//     by its children and so on as a list of Recs (depth-first).
//
//     The function thread_size generates for a message record
//     a pair consisting of the num of the record and the size of
//     the list returned by search (the record itself is counted).
//     The function ordered_thread_sizes returns these pairs for all
//     records, largest thread first.

// Finds the record with the given num. In a list where every num equals
// the position of its record the answer is taken directly from that
// position; otherwise the list is searched, so that records are also
// found when the nums do not start at 0 or have gaps.

private def lookup(num: Int, rs: List[Rec]) : Option[Rec] =
  rs.lift(num).filter(_.num == num).orElse(rs.find(_.num == num))

// The children of r need to be filled in already (see post_process).
// A child num without a record in rs contributes nothing.

def search(r: Rec, rs: List[Rec]) : List[Rec] =
  r :: r.children.flatMap(c => lookup(c, rs).toList.flatMap(search(_, rs)))

def thread_size(r: Rec, rs: List[Rec]) : (Int, Int) =
  (r.num, search(r, rs).size)

def ordered_thread_sizes(rs: List[Rec]) : List[(Int, Int)] =
  rs.map(thread_size(_, rs)).sortBy(_._2).reverse


// test cases:
//
//val recs_p = post_process(get_csv("log.csv").map(process_line))
//
//search(recs_p(459), recs_p).map(_.num)
//   => List(459, 401, 404, 426, 428, 399, 377, 357, 325, 304)
//
//thread_size(recs_p(459), recs_p)
//   => (459,10)
//
//ordered_thread_sizes(recs_p).take(4)
//   => List((402,18), (95,12), (488,11), (459,10))


}