// Resit Exam about data-mining a chat log
//=========================================
//
// This coursework is about mining a log of an online chat between 85
// participants. The log is given as a csv-list in the file
// log.csv. The log is an unordered list containing information about
// which message has been sent, by whom, when and in response to which
// other message. Each message also has a number and a unique hash code.
//
// !! For further information about the tasks, see:       !!
// !!                                                     !!
// !! https://nms.kcl.ac.uk/christian.urban/cw-resit.pdf  !!

object Resit {

import scala.io.Source
import scala.util._

//=============
// (1) The function get_csv takes a file name as argument. It should read
//     the corresponding file and return its content. The content should
//     be returned as a list of strings, a string for each line in the
//     file. Since the file is a csv-file, the first line (the header)
//     should be dropped. Lines are separated by "\n".
//
//     The file is decoded with the implicit default codec of
//     scala.io.Source. Problems with opening or reading the file are
//     propagated to the caller as exceptions.

def get_csv(name: String) : List[String] = {
  val source = Source.fromFile(name)
  // try/finally so that the file handle is released even if reading fails
  try source.mkString.split("\n").toList.drop(1)
  finally source.close()
}

// test cases:
//
// get_csv("log.csv")
// get_csv("log.csv").length  // should be 680


//=============
// (2) The function below takes a single line from the csv-file (as
//     generated by get_csv) and creates a Rec(ord) data structure. The
//     data from the csv-file should be copied as follows:
//
//     csv-file      Rec data structure
//     -----------------------------------
//     counter    => num
//     id         => msg_id
//     time_date  => date
//     name       => author
//     country    => country  (should be None if no country is given)
//     parent_id  => reply_id (should be None if there is no parent)
//     msg        => msg
//                => parent is set to None (will be calculated later)
//                => children is set to Nil (will be calculated later)
//
//     You should use the function line.split(",").toList to separate
//     the items in the csv-line. BE CAREFUL that the message text in
//     the last field can contain commas and therefore the split will not
//     always result in a list of 7 elements. You need to concatenate
//     anything beyond the 7th field into a single string for the field msg.

case class Rec(num: Int,
               msg_id: String,
               date: String,
               msg: String,
               author: String,
               country: Option[String],
               reply_id : Option[String],
               parent: Option[Int] = None,
               children: List[Int] = Nil)


// Parses one csv-line into a Rec. The counter field must be an integer,
// otherwise a NumberFormatException is thrown (this includes the empty
// line). Lines with fewer than 7 fields are treated as if the missing
// trailing fields were empty.

def process_line(line: String) : Rec = {
  // an empty csv-field stands for "not given"
  def optional(s: String) : Option[String] = if (s.isEmpty) None else Some(s)

  // split drops trailing empty fields (e.g. "1,h,d,p,,,"), so the list
  // is padded back to the 7 expected columns before it is taken apart
  line.split(",").toList.padTo(7, "") match {
    case num :: id :: date :: author :: country :: parent_id :: msg_parts =>
      // everything from the 7th field onwards belongs to the message;
      // the commas removed by split are put back in
      Rec(num.toInt, id, date, msg_parts.mkString(","), author,
          optional(country), optional(parent_id))
    case _ =>
      // cannot happen after the padding; keeps the match exhaustive
      throw new IllegalArgumentException(s"malformed csv line: $line")
  }
}


// test cases:
//
// process_line("""0,5ebeb459ac278d01301f1497,2020-05-15T15:25:13.413000,participant34,United Kingdom,5ebea6424923321d63155796,this question please?""")
//
//         ==> Rec(0,
//                 "5ebeb459ac278d01301f1497",
//                 "2020-05-15T15:25:13.413000",
//                 "this question please?",
//                 "participant34",
//                 Some("United Kingdom"),
//                 Some("5ebea6424923321d63155796"),
//                 None,
//                 List())
//
// process_line("""1,hash,date,p43,,,foo, bar""")
//
//         ==> Rec(1, "hash", "date", "foo, bar",
//                 "p43", None, None, None, List())
//
// (Note that in the second test case the message needs to be "foo, bar")


//=============
// (3) Each record in the log contains a unique hash code
//     identifying each message. Some messages also contain a hash
//     code identifying the parent message (to which question they reply).
//     The function post_process fills in the information about
//     potential children and a potential parent message.
//
//     The auxiliary function get_children takes a record e and a
//     record list rs as arguments, and returns the list of all direct
//     children (which have the hash code of e as reply_id). The list
//     of children are returned as a list of nums (in the order in which
//     the children occur in rs).
//
//     The auxiliary function get_parent returns the number of the
//     record corresponding to the reply_id (if there exists one,
//     otherwise returns None).

def get_children(e: Rec, rs: List[Rec]) : List[Int] =
  rs.collect { case r if r.reply_id.contains(e.msg_id) => r.num }

def get_parent(e: Rec, rs: List[Rec]) : Option[Int] =
  for {
    pid    <- e.reply_id
    parent <- rs.find(_.msg_id == pid)  // None if the parent is not in the log
  } yield parent.num

def post_process(rs: List[Rec]) : List[Rec] =
  rs.map(r => r.copy(parent = get_parent(r, rs), children = get_children(r, rs)))


// test cases:
//
//val recs = get_csv("log.csv").map(process_line)
//
//post_process(recs)(4).children    // List(12)
//post_process(recs)(23).children   // List(16,26)
//
//post_process(recs)(8).parent      // None
//post_process(recs)(9).parent      // Some(7)
//post_process(recs)(16).parent     // Some(23)
//post_process(recs)(26).parent     // Some(23)


//=============
// (4) The next two functions calculate the countries where
//     message authors are coming from and how many authors
//     come from each country (returned as a Map from countries
//     to numbers). In case an author did not specify a country,
//     the empty string is returned.

def get_countries(rs: List[Rec]) : Set[String] =
  rs.map(_.country.getOrElse("")).toSet

def get_countries_numbers(rs: List[Rec]) : Map[String, Int] =
  rs.map(r => (r.author, r.country.getOrElse("")))
    .distinct  // an author with many messages must be counted only once
    .groupBy { case (_, country) => country }
    .map { case (country, authors) => (country, authors.size) }

// test cases:
//
//val recs = get_csv("log.csv").map(process_line)
//
// get_countries(recs) =>
//
//    Set("", Poland, Lebanon, Trinidad and
//    Tobago, Japan, Spain, Nigeria, Peru, India, Lithuania, Hong
//    Kong, Italy, Ireland, Uganda, England, Bangladesh, China,
//    Romania, Slovakia, United Kingdom, Norway, Pakistan, Indonesia,
//    Malaysia, Turkey, Portugal, Hungary)
//
// get_countries_numbers(recs) =>
//
//    Map("" -> 4, Poland -> 2, Lebanon -> 1, Trinidad and Tobago -> 2,
//        Japan -> 1, Spain -> 1, Nigeria -> 1, Peru -> 1, India -> 6,
//        Lithuania -> 1, Hong Kong -> 3, Italy -> 1, Ireland -> 2, Uganda -> 1,
//        England -> 4, Bangladesh -> 2, China -> 4, Romania -> 3,
//        Slovakia -> 1, United Kingdom -> 34, Norway -> 1, Pakistan -> 2,
//        Indonesia -> 2, Malaysia -> 1, Turkey -> 1, Portugal -> 1,
//        Hungary -> 2)


//=============

// (5) The function ordered_thread_sizes orders the message threads
//     according to how many answers were given for one message (that is
//     how many children, grand-children and so on one message has).
//
//     The auxiliary function search enumerates all children,
//     grand-children and so on for a given record r (including the record
//     itself). Search returns the children and so on as a list of Recs.
//
//     The function thread_size generates for a record, say r, a pair
//     consisting of the number of r and the number of all children as
//     produced by search.
//
//     The function ordered_thread_sizes then orders the list of pairs
//     according to which thread in the chat is the longest.

// Enumerates the thread of r depth-first: r comes first, every reply is
// followed directly by its own replies, and siblings keep their order in
// rs. Replies are looked up via reply_id, so rs may be raw or
// post-processed. Each message hash is visited at most once, which
// guarantees termination even if the log contains a reply cycle.

def search(r: Rec, rs: List[Rec]) : List[Rec] = {
  // direct replies, keyed by the hash of the message they answer
  // (groupBy keeps the order of rs inside each group)
  val replies = rs.groupBy(_.reply_id)

  @scala.annotation.tailrec
  def walk(todo: List[Rec], seen: Set[String], acc: List[Rec]) : List[Rec] =
    todo match {
      case Nil => acc.reverse
      case x :: rest if seen(x.msg_id) => walk(rest, seen, acc)
      case x :: rest =>
        // replies of x go in front of the pending siblings => pre-order
        walk(replies.getOrElse(Some(x.msg_id), Nil) ::: rest, seen + x.msg_id, x :: acc)
    }

  walk(List(r), Set.empty, Nil)
}

def thread_size(r: Rec, rs: List[Rec]) : (Int, Int) =
  (r.num, search(r, rs).length)

def ordered_thread_sizes(rs: List[Rec]) : List[(Int, Int)] =
  // longest thread first; the sort is stable, so threads of equal size
  // keep their order from rs
  rs.map(thread_size(_, rs)).sortBy { case (_, size) => -size }


// test cases:
//
//val recs_p = post_process(get_csv("log.csv").map(process_line))
//
//search(recs_p(459), recs_p).map(_.num)
//   => List(459, 401, 404, 426, 428, 399, 377, 357, 325, 304)
//
//thread_size(recs_p(459), recs_p)
//   => (459,10)
//
//ordered_thread_sizes(recs_p).take(4)
//   => List((402,18), (95,12), (488,11), (459,10))


}