--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/solutions-resit/resit.scala Sun Aug 23 14:39:58 2020 +0100
@@ -0,0 +1,203 @@
+// Resit Exam about data-mining a chat log
+//=========================================
+//
+// This coursework is about mining a log of an online chat between 85
+// participants. The log is given as a csv-list in the file
+// log.csv. The log is an unordered list recording which message
+// has been sent, by whom, when, and in response to which other
+// message. Each message also has a number and a unique hash code.
+//
+// !! For further information about the tasks, see: !!
+// !! !!
+// !! https://nms.kcl.ac.uk/christian.urban/cw-resit.pdf !!
+
+object Resit {
+
+import io.Source
+import scala.util._
+
+//=============
+// (1) The function get_csv takes a file name as argument. It should read
+// the corresponding file and return its content. The content should
+// be returned as a list of strings, a string for each line in the
+// file. Since the file is a csv-file, the first line (the header)
+// should be dropped. Lines are separated by "\n".
+
+
+def get_csv(name: String) : List[String] = ... // to implement: the lines of the file `name`, header line dropped
+
+// test cases:
+//
+// get_csv("log.csv")
+// get_csv("log.csv").length // should be 680
+
+
+//=============
+// (2) The function below takes a single line from the csv-file (as
+// generated by get_csv) and creates a Rec(ord) data structure. The
+// data from the csv-file should be copied as follows:
+//
+// csv-file Rec data structure
+// -----------------------------------
+// counter => num
+// id => msg_id
+// time_date => date
+// name => author
+// country => country (should be None if no country is given)
+// parent_id => reply_id (should be None if there is no parent)
+// msg => msg
+// => parent is set to None (will be calculated later)
+// => children is set to Nil (will be calculated later)
+//
+// You should use the function line.split(",").toList to separate
+// the items in the csv-line. BE CAREFUL that the message text in
+// the last field can contain commas and therefore the split will not
+// always result in a list of 7 elements. You need to join the 7th item
+// and everything after it (separated by ",") into a single string for msg.
+
+case class Rec(num: Int, // csv field "counter"
+ msg_id: String, // csv field "id" (the hash code identifying the message)
+ date: String, // csv field "time_date"
+ msg: String, // csv field "msg" (the message text; may contain commas)
+ author: String, // csv field "name"
+ country: Option[String], // csv field "country"; None if no country is given
+ reply_id : Option[String], // csv field "parent_id"; None if there is no parent
+ parent: Option[Int] = None, // num of the parent record; None until calculated by post_process
+ children: List[Int] = Nil) // nums of the direct children; Nil until calculated by post_process
+
+
+
+def process_line(line: String) : Rec = ... // to implement: one csv-line => Rec, with parent = None and children = Nil
+
+
+// test cases:
+//
+// process_line("""0,5ebeb459ac278d01301f1497,2020-05-15T15:25:13.413000,participant34,United Kingdom,5ebea6424923321d63155796,this question please?""")
+//
+// ==> Rec(0,
+// "5ebeb459ac278d01301f1497",
+// "2020-05-15T15:25:13.413000",
+// "this question please?",
+// "participant34",
+// Some("United Kingdom"),
+// Some("5ebea6424923321d63155796"),
+// None,
+// List())
+//
+// process_line("""1,hash,date,p43,,,foo, bar""")
+//
+// ==> Rec(1, "hash", "date", "foo, bar",
+// "p43", None, None, None, List())
+//
+// (Note that in the second test case the message needs to be "foo, bar")
+
+
+//=============
+// (3) Each record in the log contains a unique hash code
+// identifying each message. Some messages also contain a hash
+// code identifying the parent message (to which question they reply).
+// The function post_process fills in the information about
+// potential children and a potential parent message.
+//
+// The auxiliary function get_children takes a record e and a
+// record list rs as arguments, and returns the list of all direct
+// children (which have the hash code of e as reply_id). The
+// children are returned as a list of their nums.
+//
+// The auxiliary function get_parent returns the number of the
+// record corresponding to the reply_id (if there exists one,
+// otherwise returns None).
+
+def get_children(e: Rec, rs: List[Rec]) : List[Int] = ... // to implement: nums of all records in rs whose reply_id is the msg_id of e
+
+def get_parent(e: Rec, rs: List[Rec]) : Option[Int] = ... // to implement: num of the record in rs that e replies to, otherwise None
+
+def post_process(rs: List[Rec]) : List[Rec] = ... // to implement: fills in the parent and children information of the records
+
+
+// test cases:
+//
+//val recs = get_csv("log.csv").map(process_line)
+//
+//post_process(recs)(4).children // List(12)
+//post_process(recs)(23).children // List(16,26)
+//
+//post_process(recs)(8).parent // None
+//post_process(recs)(9).parent // Some(7)
+//post_process(recs)(16).parent // Some(23)
+//post_process(recs)(26).parent // Some(23)
+
+
+//=============
+// (4) The next two functions calculate the countries where
+// message authors are coming from and how many authors
+// come from each country (returned as a Map from countries
+// to numbers). In case an author did not specify a country,
+// the empty string is used as the country name.
+
+def get_countries(rs: List[Rec]) : Set[String] = ... // to implement: countries of the message authors ("" if no country is given)
+
+def get_countries_numbers(rs: List[Rec]) : Map[String, Int] = ... // to implement: country => number of authors from that country
+
+// test cases:
+//
+//val recs = get_csv("log.csv").map(process_line)
+//
+// get_countries(recs) =>
+//
+// Set("", Poland, Lebanon, Trinidad and
+// Tobago, Japan, Spain, Nigeria, Peru, India, Lithuania, Hong
+// Kong, Italy, Ireland, Uganda, England, Bangladesh, China,
+// Romania, Slovakia, United Kingdom, Norway, Pakistan, Indonesia,
+// Malaysia, Turkey, Portugal, Hungary)
+//
+// get_countries_numbers(recs) =>
+//
+// Map("" -> 4, Poland -> 2, Lebanon -> 1, Trinidad and Tobago -> 2,
+// Japan -> 1, Spain -> 1, Nigeria -> 1, Peru -> 1, India -> 6,
+// Lithuania -> 1, Hong Kong -> 3, Italy -> 1, Ireland -> 2, Uganda -> 1,
+// England -> 4, Bangladesh -> 2, China -> 4, Romania -> 3,
+// Slovakia -> 1, United Kingdom -> 34, Norway -> 1, Pakistan -> 2,
+// Indonesia -> 2, Malaysia -> 1, Turkey -> 1, Portugal -> 1,
+// Hungary -> 2)
+
+
+//=============
+
+// (5) The function ordered_thread_sizes orders the message threads
+// according to how many answers were given for one message (that is
+// how many children, grand-children and so on one message has).
+//
+// The auxiliary function search enumerates all children,
+// grand-children and so on for a given record r (including the record
+// itself). Search returns the children and so on as a list of Recs.
+//
+// The function thread_size generates for a record, say r, a pair
+// consisting of the num of r and the number of records produced by
+// search (that is, r itself plus all its children, grand-children, ...).
+//
+// The function ordered_thread_sizes then orders the list of pairs
+// according to which thread in the chat is the longest (longest first).
+
+def search(r: Rec, rs: List[Rec]) : List[Rec] = ... // to implement: r itself plus all its children, grand-children and so on
+
+def thread_size(r: Rec, rs: List[Rec]) : (Int, Int) = ... // to implement: (num of r, number of records produced by search)
+
+def ordered_thread_sizes(rs: List[Rec]) : List[(Int, Int)] = ... // to implement: the thread_size pairs, longest thread first
+
+
+// test cases:
+//
+//val recs_p = post_process(get_csv("log.csv").map(process_line))
+//
+//search(recs_p(459), recs_p).map(_.num)
+// => List(459, 401, 404, 426, 428, 399, 377, 357, 325, 304)
+//
+//thread_size(recs_p(459), recs_p)
+// => (459,10)
+//
+//ordered_thread_sizes(recs_p).take(4)
+// => List((402,18), (95,12), (488,11), (459,10))
+
+
+}