--- a/solutions-resit/resit-sol.scala Fri Apr 26 17:29:30 2024 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,216 +0,0 @@
-// Resit Exam about data-mining a chat log
-//=========================================
-//
-// This
-//
-
-
-object Resit {
-
-import io.Source
-import scala.util._
-
-
-// (1) The function below takes file name as argument. It should read
-// the corresponding file and return its content. The content
-// should be returned as a list of strings, a string for each line
-// in the file. Since the file is a csv-file, the first line
-// should be dropped. Lines are separated by "\n".
-
-
-def get_csv(name: String) : List[String] = {
- val csv = Source.fromFile(name)("ISO-8859-1")
- csv.mkString.split("\n").toList.drop(1)
-}
-
-// test cases:
-//
-// get_csv("log.csv")
-// get_csv("log.csv").length // should be 680
-
-
-// (2) The function below takes a single line from the csv-file (as generated by
-// get_csv) and creates a Rec(ord) data structure. The data from the csv-file
-// should be copied as follows:
-//
-// csv-file Rec data structure
-// -----------------------------------
-// counter => num
-// id => msg_id
-// time_date => date
-// name => author
-// country, => country (should be None if no country is given)
-// parent_id => reply_id (should be None if there is no parent)
-// msg => msg
-// => parent is set to None (will be calculated later)
-// => children is set to Nil (will be calculated later)
-//
-// You should use the function line.split(",").toList to separate
-// the items in the csv-line. BE CAREFUL that the message texts in
-// the last field contain commas and therefore the split will not
-// always result into a list of 7 elements. You need to concatenate
-// anything beyond the 7th field into a string for the field msg.
-
-case class Rec(num: Int,
- msg_id: String,
- date: String,
- msg: String,
- author: String,
- country: Option[String],
- reply_id : Option[String],
- parent: Option[Int] = None,
- children: List[Int] = Nil)
-
-
-
-def process_line(line: String) : Rec = {
- val strs = line.split(",").toList
- Rec(num = strs(0).toInt,
- msg_id = strs(1),
- date = strs(2),
- author = strs(3),
- country = if (strs(4) == "") None else Some(strs(4)),
- reply_id = if (strs(5) == "") None else Some(strs(5)),
- msg = (for (i <- 6 until strs.length) yield strs(i)).mkString(","))
-}
-
-
-// test cases:
-//
-// process_line("""0,5ebeb459ac278d01301f1497,2020-05-15T15:25:13.413000,participant34,United Kingdom,5ebea6424923321d63155796,this question please?""")
-//
-// ==> Rec(0,
-// "5ebeb459ac278d01301f1497",
-// "2020-05-15T15:25:13.413000",
-// "this question please?",
-// "participant34",
-// Some("United Kingdom"),
-// Some("5ebea6424923321d63155796"),
-// None,
-// List())
-//
-// process_line("""1,hash,date,p43,,,foo, bar""")
-//
-// ==> Rec(1, "hash", "date", "foo, bar",
-// "p43", None, None, None, List())
-//
-// (Note that in the last test case the message needs to be "foo, bar")
-
-
-
-// (3) Each record in the log contains a unique hash code
-// identifying each message. Some messages also contain a hash
-// code identifying the parent message (to which question they reply).
-// The function post_process fills in the information about
-// potential children and a parent message.
-//
-// The auxiliary function get_children takes a record e and a
-// record list rs as arguments, and returns the list of all direct
-// children (which have the hash code of e as reply_id. The list
-// of children are returned as a list of nums.
-//
-// The auxiliary function get_parent returns the number of the
-// record corresponding to the reply_id (if there exists one,
-// otherwise returns None).
-
-def get_children(e: Rec, rs: List[Rec]) : List[Int] = {
- (rs.filter(r => r != e &&
- Some(e.msg_id) == r.reply_id)).map(_.num)
-}
-
-def get_parent(e: Rec, rs: List[Rec]) : Option[Int] = {
- (rs.find(r => r != e &&
- Some(r.msg_id) == e.reply_id)).map(_.num)
-}
-
-def post_process(rs: List[Rec]) : List[Rec] =
- rs.map(r => r.copy(parent = get_parent(r, rs),
- children = get_children(r, rs)))
-
-
-// test cases:
-//
-//val recs = get_csv("log.csv").map(process_line)
-//
-//post_process(recs)(4).children // List(12)
-//post_process(recs)(23).children // List(16,26)
-//
-//post_process(recs)(8).parent // None
-//post_process(recs)(9).parent // Some(7)
-//post_process(recs)(16).parent // Some(23)
-//post_process(recs)(26).parent // Some(23)
-
-
-// (4) The next two functions calculate the countries where
-// message authors are coming from and how many authors
-// come from each country (returned as a Map from countries
-// to numbers). In case an author did not specify a country,
-// the empty string is returned.
-
-def get_countries(rs: List[Rec]) : Set[String] =
- rs.map(_.country.getOrElse("")).toSet
-
-def get_countries_numbers(rs: List[Rec]) : Map[String, Int] = {
- val name_countries = rs.map(r => (r.author, r.country.getOrElse(""))).distinct
- name_countries.groupBy(_._2).view.mapValues(_.size).toMap
-}
-
-// test cases:
-//
-//val recs = get_csv("log.csv").map(process_line)
-//
-// get_countries(recs) =>
-//
-// Set("", Poland, Lebanon, Trinidad and
-// Tobago, Japan, Spain, Nigeria, Peru, India, Lithuania, Hong
-// Kong, Italy, Ireland, Uganda, England, Bangladesh, China,
-// Romania, Slovakia, United Kingdom, Norway, Pakistan, Indonesia,
-// Malaysia, Turkey, Portugal, Hungary)
-//
-// get_countries_numbers(recs) =>
-//
-// Map("" -> 4, Poland -> 2, Lebanon -> 1, Trinidad and Tobago -> 2,
-// Japan -> 1, Spain -> 1, Nigeria -> 1, Peru -> 1, India -> 6,
-// Lithuania -> 1, Hong Kong -> 3, Italy -> 1, Ireland -> 2, Uganda -> 1,
-// England -> 4, Bangladesh -> 2, China -> 4, Romania -> 3,
-// Slovakia -> 1, United Kingdom -> 34, Norway -> 1, Pakistan -> 2,
-// Indonesia -> 2, Malaysia -> 1, Turkey -> 1, Portugal -> 1,
-// Hungary -> 2)
-
-
-// (5) The function thread_sizes orders the message threads according to
-// how many answers were given for one message (that is how many children,
-// grand-children and so on one message received).
-//
-// The auxiliary function search enumerates all children, grand-children and
-// so on for a given record r. Search returns the children and so on as
-// a list of Recs.
-//
-// The function thread_sizes generates for every message record
-// a pair
-
-def search(r: Rec, rs: List[Rec]) : List[Rec] =
- r :: (r.children.map(c => search(rs(c), rs)).flatten)
-
-def thread_size(r: Rec, rs: List[Rec]) : (Int, Int) =
- (r.num, search(r, rs).size)
-
-def ordered_thread_sizes(rs: List[Rec]) : List[(Int, Int)] =
- rs.map(thread_size(_, rs)).sortBy(_._2).reverse
-
-
-// test cases:
-//
-//val recs_p = post_process(get_csv("log.csv").map(process_line))
-//
-//search(recs_p(459), recs_p).map(_.num)
-// => List(459, 401, 404, 426, 428, 399, 377, 357, 325, 304)
-//
-//thread_size(recs_p(459), recs_p)
-// => (459,10)
-//
-//ordered_thread_sizes(recs_p).take(4)
-// => List((402,18), (95,12), (488,11), (459,10))
-
-
-}