solutions-resit/resit-sol.scala
changeset 336 25d9c3b2bc99
child 343 c8fcc0e0a57f
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/solutions-resit/resit-sol.scala	Sun Aug 23 14:39:58 2020 +0100
@@ -0,0 +1,216 @@
+// Resit Exam about data-mining a chat log
+//=========================================
+//
+// This file contains solutions for the tasks (1) - (5) below.
+//
+
+
+object Resit {
+
+import io.Source
+import scala.util._
+
+
+// (1) The function below takes a file name as argument.  It should read
+//     the corresponding file and return its content. The content
+//     should be returned as a list of strings, a string for each line
+//     in the file. Since the file is a csv-file, the first line
+//     should be dropped. Lines are separated by "\n".
+
+
+// Reads the csv-file `name` (decoded as ISO-8859-1) and returns its
+// lines without the header line.
+//
+// name:    path of the csv-file to read
+// returns: one string per line of the file (split at "\n"), with the
+//          first line dropped
+//
+// The Source is opened inside Using.resource so that the underlying
+// file handle is closed again, also when reading the file fails.
+// Exceptions raised while opening or reading are passed on to the caller.
+def get_csv(name: String) : List[String] =
+  Using.resource(Source.fromFile(name)("ISO-8859-1")) { csv =>
+    csv.mkString.split("\n").toList.drop(1)
+  }
+
+// test cases:
+//
+// get_csv("log.csv")
+// get_csv("log.csv").length  // should be 680
+
+
+// (2) The function below takes a single line from the csv-file (as generated by
+//     get_csv) and creates a Rec(ord) data structure. The data from the csv-file 
+//     should be copied as follows:
+//
+//     csv-file         Rec data structure
+//     -----------------------------------
+//      counter      => num
+//      id           => msg_id 
+//      time_date    => date
+//      name         => author
+//      country      => country (should be None if no country is given)
+//      parent_id    => reply_id (should be None if there is no parent)
+//      msg          => msg
+//                   => parent is set to None  (will be calculated later)
+//                   => children is set to Nil (will be calculated later)
+//
+//     You should use the function line.split(",").toList to separate
+//     the items in the csv-line. BE CAREFUL that the message texts in 
+//     the last field contain commas and therefore the split will not
+//     always result in a list of 7 elements. You need to concatenate
+//     anything beyond the 7th field into a string for the field msg.
+
+// Record for one message of the chat log.
+//
+//   num      - counter of the message in the csv-file
+//   msg_id   - id (hash code) of the message
+//   date     - time_date field of the csv-file
+//   msg      - text of the message
+//   author   - name of the author
+//   country  - country of the author (None if no country is given)
+//   reply_id - id of the parent message (None if there is no parent)
+//   parent   - num of the parent record (None until it is calculated)
+//   children - nums of the direct children (Nil until they are calculated)
+case class Rec(
+  num: Int,
+  msg_id: String,
+  date: String,
+  msg: String,
+  author: String,
+  country: Option[String],
+  reply_id: Option[String],
+  parent: Option[Int] = None,
+  children: List[Int] = Nil
+)
+
+
+
+// Turns a single line of the csv-file into a Rec.
+//
+// line:    one line as produced by get_csv, consisting of the fields
+//          counter,id,time_date,name,country,parent_id,msg
+// returns: the corresponding Rec; country and reply_id are None if the
+//          field is empty or absent, msg is "" if the field is absent,
+//          parent and children keep their defaults (None and Nil)
+//
+// Throws a NumberFormatException if the counter is not an integer and
+// an IndexOutOfBoundsException if one of the fields id, time_date or
+// name is missing.
+def process_line(line: String) : Rec = {
+  // The limit 7 stops the splitting after the sixth comma, so the commas
+  // inside the message text stay in the last field. In contrast to
+  // split(","), a split with a limit also keeps trailing empty fields:
+  // "1,hash,date,p43,,," still produces the empty country, parent_id and
+  // msg fields instead of a list that is too short, and a message that
+  // ends in commas is not cut off.
+  val strs = line.split(",", 7).toList
+  Rec(num = strs(0).toInt,
+      msg_id = strs(1),
+      date = strs(2),
+      author = strs(3),
+      country = strs.lift(4).filter(_.nonEmpty),
+      reply_id = strs.lift(5).filter(_.nonEmpty),
+      msg = strs.drop(6).mkString(","))
+}
+
+
+// test cases:
+//
+// process_line("""0,5ebeb459ac278d01301f1497,2020-05-15T15:25:13.413000,participant34,United Kingdom,5ebea6424923321d63155796,this question please?""")
+// 
+//      ==>  Rec(0,
+//               "5ebeb459ac278d01301f1497",
+//               "2020-05-15T15:25:13.413000",
+//               "this question please?",
+//               "participant34",
+//               Some("United Kingdom"),
+//               Some("5ebea6424923321d63155796"),
+//               None,
+//               List())
+//
+// process_line("""1,hash,date,p43,,,foo, bar""")
+//
+//      ==>  Rec(1, "hash", "date", "foo, bar",
+//               "p43", None, None, None, List())
+//
+// (Note that in the last test case the message needs to be "foo, bar")
+
+
+
+// (3) Each record in the log contains a unique hash code
+//     identifying each message. Some messages also contain a hash
+//     code identifying the parent message (to which question they reply).
+//     The function post_process fills in the information about
+//     potential children and a parent message. 
+//  
+//     The auxiliary function get_children takes a record e and a
+//     record list rs as arguments, and returns the list of all direct
+//     children (which have the hash code of e as reply_id). The list
+//     of children is returned as a list of nums.
+//      
+//     The auxiliary function get_parent returns the number of the
+//     record corresponding to the reply_id (if there exists one,
+//     otherwise returns None).
+
+// Collects the nums of all direct children of the record e, that is of
+// all records in rs (other than e itself) whose reply_id is the msg_id
+// of e. The nums are returned in the order the records appear in rs.
+def get_children(e: Rec, rs: List[Rec]) : List[Int] =
+  for (child <- rs if child != e && child.reply_id == Some(e.msg_id))
+    yield child.num
+
+// Returns the num of the first record in rs (other than e itself) whose
+// msg_id is the reply_id of e. The result is None if e has no reply_id
+// or if no such record exists in rs.
+def get_parent(e: Rec, rs: List[Rec]) : Option[Int] =
+  for {
+    parent_id <- e.reply_id
+    parent    <- rs.find(r => r != e && r.msg_id == parent_id)
+  } yield parent.num
+
+// Fills in the fields parent and children of every record in rs, using
+// get_parent and get_children with rs as the list to search in. The
+// order of the records and all their other fields stay unchanged.
+def post_process(rs: List[Rec]) : List[Rec] =
+  for (r <- rs) yield {
+    val parent_num = get_parent(r, rs)
+    val child_nums = get_children(r, rs)
+    r.copy(parent = parent_num, children = child_nums)
+  }
+
+
+// test cases:
+//
+//val recs = get_csv("log.csv").map(process_line)
+//
+//post_process(recs)(4).children   // List(12)
+//post_process(recs)(23).children  // List(16,26)
+//
+//post_process(recs)(8).parent     // None
+//post_process(recs)(9).parent     // Some(7)
+//post_process(recs)(16).parent    // Some(23)
+//post_process(recs)(26).parent    // Some(23)
+
+
+// (4) The next two functions calculate the countries where
+//     message authors are coming from and how many authors
+//     come from each country (returned as a Map from countries
+//     to numbers). In case an author did not specify a country,
+//     the empty string is returned.
+
+// Returns the set of all countries occurring in the records rs. A
+// record without a country contributes the empty string.
+def get_countries(rs: List[Rec]) : Set[String] =
+  rs.map { r =>
+    r.country match {
+      case Some(c) => c
+      case None    => ""
+    }
+  }.toSet
+
+// Returns for every country the number of distinct authors coming from
+// it. Authors without a country are counted under the empty string.
+def get_countries_numbers(rs: List[Rec]) :  Map[String, Int] = {
+  // every (author, country) combination is counted only once, no matter
+  // how many messages the author has written
+  val author_country_pairs =
+    (for (r <- rs) yield (r.author, r.country.getOrElse(""))).distinct
+  val by_country = author_country_pairs.groupBy { case (_, country) => country }
+  by_country.map { case (country, authors) => country -> authors.size }
+}
+
+// test cases:
+//
+//val recs = get_csv("log.csv").map(process_line)
+//
+// get_countries(recs) => 
+//
+//    Set("", Poland, Lebanon, Trinidad and
+//        Tobago, Japan, Spain, Nigeria, Peru, India, Lithuania, Hong
+//        Kong, Italy, Ireland, Uganda, England, Bangladesh, China,
+//        Romania, Slovakia, United Kingdom, Norway, Pakistan, Indonesia,
+//        Malaysia, Turkey, Portugal, Hungary)
+//
+// get_countries_numbers(recs) => 
+//
+//    Map("" -> 4, Poland -> 2, Lebanon -> 1, Trinidad and Tobago -> 2,
+//        Japan -> 1, Spain -> 1, Nigeria -> 1, Peru -> 1, India -> 6,
+//        Lithuania -> 1, Hong Kong -> 3, Italy -> 1, Ireland -> 2, Uganda -> 1,
+//        England -> 4, Bangladesh -> 2, China -> 4, Romania -> 3,
+//        Slovakia -> 1, United Kingdom -> 34, Norway -> 1, Pakistan -> 2,
+//        Indonesia -> 2, Malaysia -> 1, Turkey -> 1, Portugal -> 1,
+//        Hungary -> 2)
+
+ 
+// (5) The function ordered_thread_sizes orders the message threads according
+//     to their size, that is the message itself together with all the answers
+//     it received (children, grand-children and so on), largest thread first.
+//
+//     The auxiliary function search enumerates all children, grand-children and
+//     so on for a given record r. Search returns the children and so on as
+//     a list of Recs. 
+//
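+//     The function thread_size generates for a message record
+//     a pair consisting of the num of the record and the size of its
+//     thread (the record itself plus all records found by search).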
+
+// Enumerates the thread starting at the record r: r itself, followed by
+// all its children, grand-children and so on (depth-first, the children
+// taken in the order of r.children).
+//
+// r:  the record whose thread is enumerated; its children field must
+//     already be filled in
+// rs: the records in which the children are looked up
+//
+// The entries of children are record nums, therefore every child is
+// looked up by its num and not by its position in rs. For a list where
+// the num of each record coincides with its position both give the same
+// result, but the lookup by num also works for filtered or reordered
+// lists. A num for which rs contains no record is skipped.
+def search(r: Rec, rs: List[Rec]) : List[Rec] =
+  r :: (for {
+    c          <- r.children
+    child      <- rs.find(_.num == c).toList
+    descendant <- search(child, rs)
+  } yield descendant)
+
+// Pairs the num of the record r with the size of its thread, that is
+// the number of records returned by search (r itself included).
+def thread_size(r: Rec, rs: List[Rec]) : (Int, Int) = {
+  val thread = search(r, rs)
+  (r.num, thread.length)
+}
+
+// Calculates the pair (num, thread size) for every record in rs and
+// returns the pairs ordered by thread size, largest thread first.
+def ordered_thread_sizes(rs: List[Rec]) : List[(Int, Int)] = {
+  val sizes = for (r <- rs) yield thread_size(r, rs)
+  // ascending stable sort followed by reverse: threads of equal size
+  // end up in the reverse of their order in rs
+  sizes.sortBy { case (_, size) => size }.reverse
+}
+ 
+
+// test cases: 
+//
+//val recs_p = post_process(get_csv("log.csv").map(process_line))
+//
+//search(recs_p(459), recs_p).map(_.num)
+//    => List(459, 401, 404, 426, 428, 399, 377, 357, 325, 304)
+//
+//thread_size(recs_p(459), recs_p) 
+//    => (459,10)
+//
+//ordered_thread_sizes(recs_p).take(4)
+//    => List((402,18), (95,12), (488,11), (459,10))
+
+
+}