solutions-resit/resit.scala
changeset 486 9c03b5e89a2a
parent 485 19b75e899d37
child 487 efad9725dfd8
--- a/solutions-resit/resit.scala	Fri Apr 26 17:29:30 2024 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,203 +0,0 @@
-// Resit Exam about data-mining a chat log
-//=========================================
-//
-// This coursework is about mining a log of an online chat between 85
-// participants. The log is given as a csv-list in the file
-// log.csv. The log is an unordered list containing information about which
-// message has been sent, by whom, when and in response to which other
-// message. Each message also has a number and a unique hash code.
-//
-// !! For further information about the tasks, see:      !!
-// !!                                                    !!
-// !! https://nms.kcl.ac.uk/christian.urban/cw-resit.pdf !!
-
-object Resit {
-
-import io.Source
-import scala.util._
-
-//=============
-// (1) The function get_csv takes a file name as argument. It should read
-// the corresponding file and return its content. The content should
-// be returned as a list of strings, a string for each line in the
-// file. Since the file is a csv-file, the first line (the header)
-// should be dropped. Lines are separated by "\n".
-
-
-def get_csv(name: String) : List[String] = ...
-
-// test cases:
-//
-// get_csv("log.csv")
-// get_csv("log.csv").length  // should be 680
-
-
-//=============
-// (2) The function below takes a single line from the csv-file (as
-// generated by get_csv) and creates a Rec(ord) data structure. The
-// data from the csv-file should be copied as follows:
-//
-//     csv-file         Rec data structure
-//     -----------------------------------
-//      counter      => num
-//      id           => msg_id 
-//      time_date    => date
-//      name         => author
-//      country      => country (should be None if no country is given)
-//      parent_id    => reply_id (should be None if there is no parent)
-//      msg          => msg
-//                   => parent is set to None  (will be calculated later)
-//                   => children is set to Nil (will be calculated later)
-//
-//     You should use the function line.split(",").toList to separate
-//     the items in the csv-line. BE CAREFUL that the message text in 
-//     the last field can contain commas and therefore the split will not
-//     always result in a list of 7 elements. You need to concatenate
-//     the 7th element and everything after it (re-joined with ",") into msg.
-
-case class Rec(num: Int, 
-               msg_id: String,
-               date: String,
-               msg: String,
-               author: String,
-               country: Option[String],
-               reply_id : Option[String],
-               parent: Option[Int] = None,
-               children: List[Int] = Nil)  
-
-
-
-def process_line(line: String) : Rec = ...
-
-
-// test cases:
-//
-// process_line("""0,5ebeb459ac278d01301f1497,2020-05-15T15:25:13.413000,participant34,United Kingdom,5ebea6424923321d63155796,this question please?""")
-// 
-//      ==>  Rec(0,
-//               "5ebeb459ac278d01301f1497",
-//               "2020-05-15T15:25:13.413000",
-//               "this question please?",
-//               "participant34",
-//               Some("United Kingdom"),
-//               Some("5ebea6424923321d63155796"),
-//               None,
-//               List())
-//
-// process_line("""1,hash,date,p43,,,foo, bar""")
-//
-//      ==>  Rec(1, "hash", "date", "foo, bar",
-//               "p43", None, None, None, List())
-//
-// (Note that in the second test case the message needs to be "foo, bar")
-
-
-//=============
-// (3) Each record in the log contains a unique hash code
-//     identifying each message. Some messages also contain a hash
-//     code identifying the parent message (to which question they reply).
-//     The function post_process fills in the information about
-//     potential children and a potential parent message. 
-//  
-//     The auxiliary function get_children takes a record e and a
-//     record list rs as arguments, and returns the list of all direct
-//     children (which have the hash code of e as reply_id). The list
-//     of children is returned as a list of nums.
-//      
-//     The auxiliary function get_parent returns the number of the
-//     record corresponding to the reply_id (if there exists one,
-//     otherwise returns None).
-
-def get_children(e: Rec, rs: List[Rec]) : List[Int] = ...
-
-def get_parent(e: Rec, rs: List[Rec]) : Option[Int] = ...
-
-def post_process(rs: List[Rec]) : List[Rec] = ...
-
-
-// test cases:
-//
-//val recs = get_csv("log.csv").map(process_line)
-//
-//post_process(recs)(4).children   // List(12)
-//post_process(recs)(23).children  // List(16,26)
-//
-//post_process(recs)(8).parent     // None
-//post_process(recs)(9).parent     // Some(7)
-//post_process(recs)(16).parent    // Some(23)
-//post_process(recs)(26).parent    // Some(23)
-
-
-//=============
-// (4) The next two functions calculate the countries where
-//     message authors are coming from and how many authors
-//     come from each country (returned as a Map from countries
-//     to numbers). In case an author did not specify a country,
-//     the empty string is returned.
-
-def get_countries(rs: List[Rec]) : Set[String] = ...
-  
-def get_countries_numbers(rs: List[Rec]) :  Map[String, Int] = ...
-
-// test cases:
-//
-//val recs = get_csv("log.csv").map(process_line)
-//
-// get_countries(recs) => 
-//
-//    Set("", Poland, Lebanon, Trinidad and
-//        Tobago, Japan, Spain, Nigeria, Peru, India, Lithuania, Hong
-//        Kong, Italy, Ireland, Uganda, England, Bangladesh, China,
-//        Romania, Slovakia, United Kingdom, Norway, Pakistan, Indonesia,
-//        Malaysia, Turkey, Portugal, Hungary)
-//
-// get_countries_numbers(recs) => 
-//
-//    Map("" -> 4, Poland -> 2, Lebanon -> 1, Trinidad and Tobago -> 2,
-//        Japan -> 1, Spain -> 1, Nigeria -> 1, Peru -> 1, India -> 6,
-//        Lithuania -> 1, Hong Kong -> 3, Italy -> 1, Ireland -> 2, Uganda -> 1,
-//        England -> 4, Bangladesh -> 2, China -> 4, Romania -> 3,
-//        Slovakia -> 1, United Kingdom -> 34, Norway -> 1, Pakistan -> 2,
-//        Indonesia -> 2, Malaysia -> 1, Turkey -> 1, Portugal -> 1,
-//        Hungary -> 2)
-
-
-//============= 
-
-// (5) The function ordered_thread_sizes orders the message threads
-// according to how many answers were given for one message (that is
-// how many children, grand-children and so on one message has).
-//
-// The auxiliary function search enumerates all children,
-// grand-children and so on for a given record r (including the record
-// itself). Search returns the children and so on as a list of Recs.
-//
-// The function thread_size generates for a record, say r, a pair
-// consisting of the number of r and the number of records produced
-// by search (that is, r itself plus all its children and so on).
-//
-// The function ordered_thread_sizes then orders the list of pairs
-// according to which thread in the chat is the longest.
-
-def search(r: Rec, rs: List[Rec]) : List[Rec] = ...
-  
-def thread_size(r: Rec, rs: List[Rec]) : (Int, Int) = ...
- 
-def ordered_thread_sizes(rs: List[Rec]) : List[(Int, Int)] = ...
- 
-
-// test cases: 
-//
-//val recs_p = post_process(get_csv("log.csv").map(process_line))
-//
-//search(recs_p(459), recs_p).map(_.num)
-//    => List(459, 401, 404, 426, 428, 399, 377, 357, 325, 304)
-//
-//thread_size(recs_p(459), recs_p) 
-//    => (459,10)
-//
-//ordered_thread_sizes(recs_p).take(4)
-//    => List((402,18), (95,12), (488,11), (459,10))
-
-
-}