solutions-resit/resit.scala
changeset 486 9c03b5e89a2a
parent 485 19b75e899d37
child 487 efad9725dfd8
equal deleted inserted replaced
485:19b75e899d37 486:9c03b5e89a2a
     1 // Resit Exam about data-mining a chat log
       
     2 //=========================================
       
     3 //
       
     4 // This coursework is about mining a log of an online chat between 85
       
     5 // participants. The log is given as a csv-list in the file
       
     6 // log.csv. The log is an unordered list containing information which
       
     7 // message has been sent, by whom, when and in response to which other
       
     8 // message. Each message has also a number and a unique hash code.
       
     9 //
       
     10 // !! For further information about the tasks, see:      !!
       
    11 // !!                                                    !!
       
    12 // !! https://nms.kcl.ac.uk/christian.urban/cw-resit.pdf !!
       
    13 
       
    14 object Resit {
       
    15 
       
    16 import io.Source
       
    17 import scala.util._
       
    18 
       
    19 //=============
       
    20 // (1) The function get_csv takes file name as argument. It should read
       
    21 // the corresponding file and return its content. The content should
       
    22 // be returned as a list of strings, a string for each line in the
       
    23 // file. Since the file is a csv-file, the first line (the header)
       
    24 // should be dropped. Lines are separated by "\n".
       
    25 
       
    26 
       
    27 def get_csv(name: String) : List[String] = ...
       
    28 
       
    29 // test cases:
       
    30 //
       
    31 // get_csv("log.csv")
       
    32 // get_csv("log.csv").length  // should be 680
       
    33 
       
    34 
       
    35 //=============
       
    36 // (2) The function below takes a single line from the csv-file (as
       
    37 // generated by get_csv) and creates a Rec(ord) data structure. The
       
    38 // data from the csv-file should be copied as follows:
       
    39 //
       
    40 //     csv-file         Rec data structure
       
    41 //     -----------------------------------
       
    42 //      counter      => num
       
    43 //      id           => msg_id 
       
    44 //      time_date    => date
       
    45 //      name         => author
       
    46 //      country      => country (should be None if no country is given)
       
    47 //      parent_id    => reply_id (should be None if there is no parent)
       
    48 //      msg          => msg
       
    49 //                   => parent is set to None  (will be calculated later)
       
    50 //                   => children is set to Nil (will be calculated later)
       
    51 //
       
    52 //     You should use the function line.split(",").toList to separate
       
    53 //     the items in the csv-line. BE CAREFUL that the message text in 
       
    54 //     the last field can contain commas and therefore the split will not
       
    55 //     always result in a list of 7 elements. You need to concatenate
       
    56 //     anything beyond the 7th field into a single string for the field msg.
       
    57 
       
    58 case class Rec(num: Int, 
       
    59                msg_id: String,
       
    60                date: String,
       
    61                msg: String,
       
    62                author: String,
       
    63                country: Option[String],
       
    64                reply_id : Option[String],
       
    65                parent: Option[Int] = None,
       
    66                children: List[Int] = Nil)  
       
    67 
       
    68 
       
    69 
       
    70 def process_line(line: String) : Rec = ...
       
    71 
       
    72 
       
    73 // test cases:
       
    74 //
       
    75 // process_line("""0,5ebeb459ac278d01301f1497,2020-05-15T15:25:13.413000,participant34,United Kingdom,5ebea6424923321d63155796,this question please?""")
       
    76 // 
       
    77 //      ==>  Rec(0,
       
    78 //               "5ebeb459ac278d01301f1497",
       
    79 //               "2020-05-15T15:25:13.413000",
       
    80 //               "this question please?",
       
    81 //               "participant34",
       
    82 //               Some("United Kingdom"),
       
    83 //               Some("5ebea6424923321d63155796"),
       
    84 //               None,
       
    85 //               List())
       
    86 //
       
    87 // process_line("""1,hash,date,p43,,,foo, bar""")
       
    88 //
       
    89 //      ==>  Rec(1, "hash", "date", "foo, bar",
       
    90 //               "p43", None, None, None, List())
       
    91 //
       
    92 // (Note that in the second test case the message needs to be "foo, bar")
       
    93 
       
    94 
       
    95 //=============
       
    96 // (3) Each record in the log contains a unique hash code
       
    97 //     identifying each message. Some messages also contain a hash
       
    98 //     code identifying the parent message (to which question they reply).
       
    99 //     The function post_process fills in the information about
       
   100 //     potential children and a potential parent message. 
       
   101 //  
       
   102 //     The auxiliary function get_children takes a record e and a
       
   103 //     record list rs as arguments, and returns the list of all direct
       
   104 //     children (which have the hash code of e as reply_id). The list
       
   105 //     of children are returned as a list of nums.
       
   106 //      
       
   107 //     The auxiliary function get_parent returns the number of the
       
   108 //     record corresponding to the reply_id (if there exists one,
       
   109 //     otherwise returns None).
       
   110 
       
   111 def get_children(e: Rec, rs: List[Rec]) : List[Int] = ...
       
   112 
       
   113 def get_parent(e: Rec, rs: List[Rec]) : Option[Int] = ...
       
   114 
       
   115 def post_process(rs: List[Rec]) : List[Rec] = ...
       
   116 
       
   117 
       
   118 // test cases:
       
   119 //
       
   120 //val recs = get_csv("log.csv").map(process_line)
       
   121 //
       
   122 //post_process(recs)(4).children   // List(12)
       
   123 //post_process(recs)(23).children  // List(16,26)
       
   124 //
       
   125 //post_process(recs)(8).parent     // None
       
   126 //post_process(recs)(9).parent     // Some(7)
       
   127 //post_process(recs)(16).parent    // Some(23)
       
   128 //post_process(recs)(26).parent    // Some(23)
       
   129 
       
   130 
       
   131 //=============
       
   132 // (4) The next two functions calculate the countries where
       
   133 //     message authors are coming from and how many authors
       
   134 //     come from each country (returned as a Map from countries
       
   135 //     to numbers). In case an author did not specify a country,
       
   136 //     the empty string is returned.
       
   137 
       
   138 def get_countries(rs: List[Rec]) : Set[String] = ...
       
   139   
       
   140 def get_countries_numbers(rs: List[Rec]) :  Map[String, Int] = ...
       
   141 
       
   142 // test cases:
       
   143 //
       
   144 //val recs = get_csv("log.csv").map(process_line)
       
   145 //
       
   146 // get_countries(recs) => 
       
   147 //
       
   148 //    Set("", Poland, Lebanon, Trinidad and
       
   149 //        Tobago, Japan, Spain, Nigeria, Peru, India, Lithuania, Hong
       
   150 //        Kong, Italy, Ireland, Uganda, England, Bangladesh, China,
       
   151 //        Romania, Slovakia, United Kingdom, Norway, Pakistan, Indonesia,
       
   152 //        Malaysia, Turkey, Portugal, Hungary)
       
   153 //
       
   154 // get_countries_numbers(recs) => 
       
   155 //
       
   156 //    Map("" -> 4, Poland -> 2, Lebanon -> 1, Trinidad and Tobago -> 2,
       
   157 //        Japan -> 1, Spain -> 1, Nigeria -> 1, Peru -> 1, India -> 6,
       
   158 //        Lithuania -> 1, Hong Kong -> 3, Italy -> 1, Ireland -> 2, Uganda -> 1,
       
   159 //        England -> 4, Bangladesh -> 2, China -> 4, Romania -> 3,
       
   160 //        Slovakia -> 1, United Kingdom -> 34, Norway -> 1, Pakistan -> 2,
       
   161 //        Indonesia -> 2, Malaysia -> 1, Turkey -> 1, Portugal -> 1,
       
   162 //        Hungary -> 2)
       
   163 
       
   164 
       
   165 //============= 
       
   166 
       
   167 // (5) The function ordered_thread_sizes orders the message threads
       
   168 // according to how many answers were given for one message (that is
       
   169 // how many children, grand-children and so on one message has).
       
   170 //
       
   171 // The auxiliary function search enumerates all children,
       
   172 // grand-children and so on for a given record r (including the record
       
   173 // itself). Search returns the children and so on as a list of Recs.
       
   174 //
       
   175 // The function thread_size generates for a record, say r, a pair
       
   176 // consisting of the number of r and the number of all children as
       
   177 // produced by search. 
       
   178 //
       
   179 // The function ordered_thread_sizes orders than the list of pairs
       
   180 // according to which thread in the chat is the longest.
       
   181 
       
   182 def search(r: Rec, rs: List[Rec]) : List[Rec] = ...
       
   183   
       
   184 def thread_size(r: Rec, rs: List[Rec]) : (Int, Int) = ...
       
   185  
       
   186 def ordered_thread_sizes(rs: List[Rec]) : List[(Int, Int)] = ...
       
   187  
       
   188 
       
   189 // test cases: 
       
   190 //
       
   191 //val recs_p = post_process(get_csv("log.csv").map(process_line))
       
   192 //
       
   193 //search(recs_p(459), recs_p).map(_.num)
       
   194 //    => List(459, 401, 404, 426, 428, 399, 377, 357, 325, 304)
       
   195 //
       
   196 //thread_size(recs_p(459), recs_p) 
       
   197 //    => (459,10)
       
   198 //
       
   199 //ordered_thread_sizes(recs_p).take(4)
       
   200 //    => List((402,18), (95,12), (488,11), (459,10))
       
   201 
       
   202 
       
   203 }