// Resit Exam about data-mining a chat log
//=========================================
//
// Recovered from a Mercurial diff (19b75e899d37 -> 9c03b5e89a2a) that
// removed solutions-resit/resit-sol.scala.
//


object Resit {

import scala.io.Source
import scala.util._


// (1) The function below takes a file name as argument. It should read
//     the corresponding file and return its content. The content
//     should be returned as a list of strings, a string for each line
//     in the file. Since the file is a csv-file, the first line
//     should be dropped. Lines are separated by "\n".
//
//     The file is decoded as ISO-8859-1. Using.resource closes the
//     underlying source even when reading throws, so no file handle
//     is leaked.

def get_csv(name: String) : List[String] =
  Using.resource(Source.fromFile(name, "ISO-8859-1")) { csv =>
    csv.mkString.split("\n").toList.drop(1)
  }

// test cases:
//
// get_csv("log.csv")
// get_csv("log.csv").length  // should be 680


// (2) The function below takes a single line from the csv-file (as generated by
//     get_csv) and creates a Rec(ord) data structure. The data from the csv-file
//     should be copied as follows:
//
//     csv-file         Rec data structure
//     -----------------------------------
//     counter    =>    num
//     id         =>    msg_id
//     time_date  =>    date
//     name       =>    author
//     country    =>    country  (should be None if no country is given)
//     parent_id  =>    reply_id (should be None if there is no parent)
//     msg        =>    msg
//                =>    parent is set to None (will be calculated later)
//                =>    children is set to Nil (will be calculated later)
//
//     BE CAREFUL that the message texts in the last field contain commas
//     and therefore the split will not always result in a list of
//     7 elements. Anything beyond the 6th field is concatenated (with the
//     commas put back) into the field msg.

case class Rec(num: Int,                  // running counter from the csv-file
               msg_id: String,            // hash code identifying the message
               date: String,              // time stamp, kept as text
               msg: String,               // message text (may contain commas)
               author: String,            // name of the author
               country: Option[String],   // None if no country was given
               reply_id : Option[String], // msg_id of the parent message, if any
               parent: Option[Int] = None,  // num of the parent (see post_process)
               children: List[Int] = Nil)   // nums of direct replies (see post_process)


// Turns an empty csv field into None and anything else into Some(field).
private def non_empty(field: String) : Option[String] =
  Some(field).filter(_.nonEmpty)

// Parses one csv line into a Rec.
//
// The split uses limit -1 so that trailing empty fields are kept: a plain
// split(",") silently drops them, which made lines ending in empty fields
// (e.g. "2,h,d,a,,,") fail and lost trailing commas of the message text.
//
// Throws NumberFormatException if the counter is not an integer and
// IndexOutOfBoundsException if the line has fewer than 6 fields.
def process_line(line: String) : Rec =
  line.split(",", -1).toList match {
    case num :: msg_id :: date :: author :: country :: reply_id :: msg_parts =>
      Rec(num = num.toInt,
          msg_id = msg_id,
          date = date,
          author = author,
          country = non_empty(country),
          reply_id = non_empty(reply_id),
          msg = msg_parts.mkString(","))
    case fields =>
      throw new IndexOutOfBoundsException(
        s"csv line has ${fields.length} field(s), expected at least 6: $line")
  }


// test cases:
//
// process_line("""0,5ebeb459ac278d01301f1497,2020-05-15T15:25:13.413000,participant34,United Kingdom,5ebea6424923321d63155796,this question please?""")
//
// ==>  Rec(0,
//          "5ebeb459ac278d01301f1497",
//          "2020-05-15T15:25:13.413000",
//          "this question please?",
//          "participant34",
//          Some("United Kingdom"),
//          Some("5ebea6424923321d63155796"),
//          None,
//          List())
//
// process_line("""1,hash,date,p43,,,foo, bar""")
//
// ==>  Rec(1, "hash", "date", "foo, bar",
//          "p43", None, None, None, List())
//
// (Note that in the last test case the message needs to be "foo, bar")



// (3) Each record in the log contains a unique hash code
//     identifying each message. Some messages also contain a hash
//     code identifying the parent message (to which question they reply).
//     The function post_process fills in the information about
//     potential children and a parent message.
//
//     The auxiliary function get_children takes a record e and a
//     record list rs as arguments, and returns the list of all direct
//     children (which have the hash code of e as reply_id). The list
//     of children is returned as a list of nums.
//
//     The auxiliary function get_parent returns the number of the
//     record corresponding to the reply_id (if there exists one,
//     otherwise returns None).

// nums of all records in rs (other than e itself) that reply to e
def get_children(e: Rec, rs: List[Rec]) : List[Int] =
  rs.collect { case r if r != e && r.reply_id.contains(e.msg_id) => r.num }

// num of the first record in rs (other than e itself) that e replies to
def get_parent(e: Rec, rs: List[Rec]) : Option[Int] =
  rs.find(r => r != e && e.reply_id.contains(r.msg_id)).map(_.num)

// fills in parent and children of every record, relative to the list rs
def post_process(rs: List[Rec]) : List[Rec] =
  rs.map(r => r.copy(parent = get_parent(r, rs),
                     children = get_children(r, rs)))


// test cases:
//
//val recs = get_csv("log.csv").map(process_line)
//
//post_process(recs)(4).children   // List(12)
//post_process(recs)(23).children  // List(16,26)
//
//post_process(recs)(8).parent     // None
//post_process(recs)(9).parent     // Some(7)
//post_process(recs)(16).parent    // Some(23)
//post_process(recs)(26).parent    // Some(23)


// (4) The next two functions calculate the countries where
//     message authors are coming from and how many authors
//     come from each country (returned as a Map from countries
//     to numbers). In case an author did not specify a country,
//     the empty string is returned.

def get_countries(rs: List[Rec]) : Set[String] =
  rs.map(_.country.getOrElse("")).toSet

// Every distinct (author, country) pair is counted once, so an author
// with several messages from the same country contributes 1.
def get_countries_numbers(rs: List[Rec]) : Map[String, Int] = {
  val name_countries = rs.map(r => (r.author, r.country.getOrElse(""))).distinct
  name_countries.groupMapReduce(_._2)(_ => 1)(_ + _)
}

// test cases:
//
//val recs = get_csv("log.csv").map(process_line)
//
// get_countries(recs) =>
//
//    Set("", Poland, Lebanon, Trinidad and
//    Tobago, Japan, Spain, Nigeria, Peru, India, Lithuania, Hong
//    Kong, Italy, Ireland, Uganda, England, Bangladesh, China,
//    Romania, Slovakia, United Kingdom, Norway, Pakistan, Indonesia,
//    Malaysia, Turkey, Portugal, Hungary)
//
// get_countries_numbers(recs) =>
//
//    Map("" -> 4, Poland -> 2, Lebanon -> 1, Trinidad and Tobago -> 2,
//        Japan -> 1, Spain -> 1, Nigeria -> 1, Peru -> 1, India -> 6,
//        Lithuania -> 1, Hong Kong -> 3, Italy -> 1, Ireland -> 2, Uganda -> 1,
//        England -> 4, Bangladesh -> 2, China -> 4, Romania -> 3,
//        Slovakia -> 1, United Kingdom -> 34, Norway -> 1, Pakistan -> 2,
//        Indonesia -> 2, Malaysia -> 1, Turkey -> 1, Portugal -> 1,
//        Hungary -> 2)


// (5) The function ordered_thread_sizes orders the message threads according
//     to how many answers were given for one message (that is how many
//     children, grand-children and so on one message received).
//
//     The auxiliary function search enumerates all children, grand-children
//     and so on for a given record r. Search returns r followed by its
//     children and so on as a list of Recs (depth-first, in children order).
//
//     The function thread_size generates for a message record the pair
//     (num of the record, size of the list returned by search).

// Index from num to record. The children lists contain nums, so records
// are looked up by num rather than by their position in the list.
private def index_by_num(rs: List[Rec]) : Map[Int, Rec] =
  rs.map(r => r.num -> r).toMap

// r followed by all its descendants; fails if a child num is unknown
private def collect_thread(r: Rec, by_num: Map[Int, Rec]) : List[Rec] =
  r :: r.children.flatMap { c =>
    val child = by_num.getOrElse(c,
      throw new IndexOutOfBoundsException(s"no record with num $c"))
    collect_thread(child, by_num)
  }

def search(r: Rec, rs: List[Rec]) : List[Rec] =
  collect_thread(r, index_by_num(rs))

def thread_size(r: Rec, rs: List[Rec]) : (Int, Int) =
  (r.num, search(r, rs).size)

// Largest threads first. The index is built once for the whole list.
// sortBy(...).reverse is kept on purpose: it fixes the order of threads
// with equal sizes.
def ordered_thread_sizes(rs: List[Rec]) : List[(Int, Int)] = {
  val by_num = index_by_num(rs)
  rs.map(r => (r.num, collect_thread(r, by_num).size)).sortBy(_._2).reverse
}


// test cases:
//
//val recs_p = post_process(get_csv("log.csv").map(process_line))
//
//search(recs_p(459), recs_p).map(_.num)
//  => List(459, 401, 404, 426, 428, 399, 377, 357, 325, 304)
//
//thread_size(recs_p(459), recs_p)
//  => (459,10)
//
//ordered_thread_sizes(recs_p).take(4)
//  => List((402,18), (95,12), (488,11), (459,10))


}