// Resit Exam about data-mining a chat log
//=========================================
//
// (Origin: solutions-resit/resit.scala, changeset 25d9c3b2bc99,
//  Sun Aug 23 14:39:58 2020 +0100.)
//
// This coursework is about mining a log of an online chat between 85
// participants. The log is given as a csv-list in the file
// log.csv. The log is an unordered list containing information which
// message has been sent, by whom, when and in response to which other
// message. Each message has also a number and a unique hash code.
//
// !! For further information about the tasks, see:      !!
// !!                                                    !!
// !!  https://nms.kcl.ac.uk/christian.urban/cw-resit.pdf !!

object Resit {

import io.Source
import scala.util._

//=============
// (1) The function get_csv takes a file name as argument. It should read
//     the corresponding file and return its content. The content should
//     be returned as a list of strings, a string for each line in the
//     file. Since the file is a csv-file, the first line (the header)
//     should be dropped. Lines are separated by "\n".
//
// Implementation notes:
//  - the file handle is always closed, even if reading fails;
//  - the file is decoded as UTF-8, but malformed bytes are replaced
//    instead of raising an exception (the real encoding of log.csv is
//    not known from this file -- TODO confirm).

def get_csv(name: String) : List[String] = {
  val lenient_utf8 =
    scala.io.Codec("UTF-8")
      .onMalformedInput(java.nio.charset.CodingErrorAction.REPLACE)
      .onUnmappableCharacter(java.nio.charset.CodingErrorAction.REPLACE)
  val source = Source.fromFile(name)(lenient_utf8)
  try {
    // drop(1) removes the csv header line
    source.mkString.split("\n").toList.drop(1)
  } finally {
    source.close()
  }
}

// test cases:
//
// get_csv("log.csv")
// get_csv("log.csv").length  // should be 680


//=============
// (2) The function below takes a single line from the csv-file (as
//     generated by get_csv) and creates a Rec(ord) data structure. The
//     data from the csv-file should be copied as follows:
//
//     csv-file         Rec data structure
//     -----------------------------------
//     counter      =>  num
//     id           =>  msg_id
//     time_date    =>  date
//     name         =>  author
//     country      =>  country (should be None if no country is given)
//     parent_id    =>  reply_id (should be None if there is no parent)
//     msg          =>  msg
//                  =>  parent is set to None (will be calculated later)
//                  =>  children is set to Nil (will be calculated later)
//
//     You should use the function line.split(",").toList to separate
//     the items in the csv-line. BE CAREFUL that the message text in
//     the last field can contain commas and therefore the split will not
//     always result in a list of 7 elements. You need to concatenate
//     anything beyond the 7th field into a single string for the field msg.

case class Rec(num: Int,
               msg_id: String,
               date: String,
               msg: String,
               author: String,
               country: Option[String],
               reply_id : Option[String],
               parent: Option[Int] = None,
               children: List[Int] = Nil)

// Turns an empty csv field into None and any other field into Some(field).
private def non_empty(field: String) : Option[String] =
  Some(field).filter(_.nonEmpty)

// Implementation notes:
//  - the line is split with a limit of 7, which keeps everything after
//    the sixth comma as one field. This is the same result as splitting
//    on every comma and re-joining the tail, except that commas (and
//    empty fields) at the very end of a line are not lost;
//  - a line with fewer than 7 fields raises an IllegalArgumentException;
//    a non-numeric counter raises a NumberFormatException (from toInt).

def process_line(line: String) : Rec =
  line.split(",", 7).toList match {
    case List(num, id, date, author, country, reply, msg) =>
      Rec(num.toInt, id, date, msg, author, non_empty(country), non_empty(reply))
    case fields =>
      throw new IllegalArgumentException(
        s"expected 7 comma-separated fields but found ${fields.length} in line: $line")
  }


// test cases:
//
// process_line("""0,5ebeb459ac278d01301f1497,2020-05-15T15:25:13.413000,participant34,United Kingdom,5ebea6424923321d63155796,this question please?""")
//
// ==> Rec(0,
//         "5ebeb459ac278d01301f1497",
//         "2020-05-15T15:25:13.413000",
//         "this question please?",
//         "participant34",
//         Some("United Kingdom"),
//         Some("5ebea6424923321d63155796"),
//         None,
//         List())
//
// process_line("""1,hash,date,p43,,,foo, bar""")
//
// ==> Rec(1, "hash", "date", "foo, bar",
//         "p43", None, None, None, List())
//
// (Note that in the second test case the message needs to be "foo, bar")


//=============
// (3) Each record in the log contains a unique hash code
//     identifying each message. Some messages also contain a hash
//     code identifying the parent message (to which question they reply).
//     The function post_process fills in the information about
//     potential children and a potential parent message.
//
//     The auxiliary function get_children takes a record e and a
//     record list rs as arguments, and returns the list of all direct
//     children (which have the hash code of e as reply_id). The list
//     of children are returned as a list of nums.
//
//     The auxiliary function get_parent returns the number of the
//     record corresponding to the reply_id (if there exists one,
//     otherwise returns None).

// Children are listed in the order in which they occur in rs.
def get_children(e: Rec, rs: List[Rec]) : List[Int] =
  rs.collect { case r if r.reply_id.contains(e.msg_id) => r.num }

// Returns None if e has no reply_id, or if no record in rs carries
// that hash code (a dangling reply_id).
def get_parent(e: Rec, rs: List[Rec]) : Option[Int] =
  for {
    id     <- e.reply_id
    parent <- rs.find(_.msg_id == id)
  } yield parent.num

// Returns the records in their original order, each with parent and
// children filled in relative to the whole list rs.
def post_process(rs: List[Rec]) : List[Rec] =
  rs.map(r => r.copy(parent = get_parent(r, rs), children = get_children(r, rs)))


// test cases:
//
//val recs = get_csv("log.csv").map(process_line)
//
//post_process(recs)(4).children   // List(12)
//post_process(recs)(23).children  // List(16,26)
//
//post_process(recs)(8).parent     // None
//post_process(recs)(9).parent     // Some(7)
//post_process(recs)(16).parent    // Some(23)
//post_process(recs)(26).parent    // Some(23)


//=============
// (4) The next two functions calculate the countries where
//     message authors are coming from and how many authors
//     come from each country (returned as a Map from countries
//     to numbers). In case an author did not specify a country,
//     the empty string is returned.

def get_countries(rs: List[Rec]) : Set[String] =
  rs.map(_.country.getOrElse("")).toSet

// Counts AUTHORS, not messages: every distinct (author, country) pair
// is counted once, however many messages the author has written.
def get_countries_numbers(rs: List[Rec]) : Map[String, Int] =
  rs.map(r => (r.author, r.country.getOrElse("")))
    .distinct
    .groupBy { case (_, country) => country }
    .map { case (country, authors) => country -> authors.length }

// test cases:
//
//val recs = get_csv("log.csv").map(process_line)
//
// get_countries(recs) =>
//
//    Set("", Poland, Lebanon, Trinidad and
//    Tobago, Japan, Spain, Nigeria, Peru, India, Lithuania, Hong
//    Kong, Italy, Ireland, Uganda, England, Bangladesh, China,
//    Romania, Slovakia, United Kingdom, Norway, Pakistan, Indonesia,
//    Malaysia, Turkey, Portugal, Hungary)
//
// get_countries_numbers(recs) =>
//
//    Map("" -> 4, Poland -> 2, Lebanon -> 1, Trinidad and Tobago -> 2,
//        Japan -> 1, Spain -> 1, Nigeria -> 1, Peru -> 1, India -> 6,
//        Lithuania -> 1, Hong Kong -> 3, Italy -> 1, Ireland -> 2, Uganda -> 1,
//        England -> 4, Bangladesh -> 2, China -> 4, Romania -> 3,
//        Slovakia -> 1, United Kingdom -> 34, Norway -> 1, Pakistan -> 2,
//        Indonesia -> 2, Malaysia -> 1, Turkey -> 1, Portugal -> 1,
//        Hungary -> 2)


//=============

// (5) The function ordered_thread_sizes orders the message threads
//     according to how many answers were given for one message (that is
//     how many children, grand-children and so on one message has).
//
//     The auxiliary function search enumerates all children,
//     grand-children and so on for a given record r (including the record
//     itself). Search returns the children and so on as a list of Recs.
//
//     The function thread_size generates for a record, say r, a pair
//     consisting of the number of r and the number of all children as
//     produced by search.
//
//     The function ordered_thread_sizes then orders the list of pairs
//     according to which thread in the chat is the longest.

// Index from a message hash code to the records that directly reply to
// it; the replies keep the order in which they occur in rs.
private def reply_index(rs: List[Rec]) : Map[String, List[Rec]] =
  rs.flatMap(r => r.reply_id.map(id => (id, r)))
    .groupBy { case (id, _) => id }
    .map { case (id, pairs) => id -> pairs.map { case (_, reply) => reply } }

// Depth-first, pre-order enumeration of r and all its (transitive)
// replies. The loop is tail-recursive, so long threads cannot overflow
// the stack, and every hash code is visited at most once, so a
// (malformed) cycle of reply_ids cannot cause an endless loop.
private def collect_thread(r: Rec, replies: Map[String, List[Rec]]) : List[Rec] = {
  @scala.annotation.tailrec
  def loop(todo: List[Rec], seen: Set[String], acc: List[Rec]) : List[Rec] =
    todo match {
      case Nil => acc.reverse
      case x :: rest if seen.contains(x.msg_id) => loop(rest, seen, acc)
      case x :: rest =>
        loop(replies.getOrElse(x.msg_id, Nil) ::: rest, seen + x.msg_id, x :: acc)
    }
  loop(List(r), Set.empty, Nil)
}

// The replies are found via the reply_id fields, so the result is the
// same whether or not rs has already been through post_process.
def search(r: Rec, rs: List[Rec]) : List[Rec] =
  collect_thread(r, reply_index(rs))

// The second component counts r itself plus all its transitive replies.
def thread_size(r: Rec, rs: List[Rec]) : (Int, Int) =
  (r.num, search(r, rs).length)

// Largest thread first; threads of equal size keep their order from rs
// (sortBy is stable). The reply index is built once for all records.
def ordered_thread_sizes(rs: List[Rec]) : List[(Int, Int)] = {
  val replies = reply_index(rs)
  rs.map(r => (r.num, collect_thread(r, replies).length))
    .sortBy { case (_, size) => -size }
}


// test cases:
//
//val recs_p = post_process(get_csv("log.csv").map(process_line))
//
//search(recs_p(459), recs_p).map(_.num)
//   => List(459, 401, 404, 426, 428, 399, 377, 357, 325, 304)
//
//thread_size(recs_p(459), recs_p)
//   => (459,10)
//
//ordered_thread_sizes(recs_p).take(4)
//   => List((402,18), (95,12), (488,11), (459,10))


}