solutions-resit/resit.scala
author Christian Urban <christian.urban@kcl.ac.uk>
Thu, 04 Aug 2022 16:53:38 +0200
changeset 423 554278cd4b70
parent 343 51e25cc30483
permissions -rw-r--r--
updated
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
336
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     1
// Resit Exam about data-mining a chat log
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     2
//=========================================
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     3
//
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     4
// This coursework is about mining a log of an online chat between 85
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     5
// participants. The log is given as a csv-list in the file
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     6
// log.csv. The log is an unordered list containing information which
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     7
// message has been sent, by whom, when and in response to which other
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     8
// message. Each message also has a number and a unique hash code.
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     9
//
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    10
// !! For further information about the tasks, see:      !!
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    11
// !!                                                    !!
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    12
// !! https://nms.kcl.ac.uk/christian.urban/cw-resit.pdf !!
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    13
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    14
object Resit {
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    15
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    16
import io.Source
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    17
import scala.util._
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    18
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    19
//=============
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    20
// (1) The function get_csv takes a file name as argument. It should read
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    21
// the corresponding file and return its content. The content should
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    22
// be returned as a list of strings, a string for each line in the
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    23
// file. Since the file is a csv-file, the first line (the header)
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    24
// should be dropped. Lines are separated by "\n".
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    25
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    26
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    27
// Reads the csv-file `name` and returns its lines without the header line.
// Lines are separated at "\n", as stated in the task description.
// The file is decoded as UTF-8; bytes that are not valid UTF-8 are replaced
// instead of aborting the read (NOTE(review): the actual encoding of log.csv
// is not stated anywhere in this file - confirm). The file handle is always
// closed, also when reading fails.
def get_csv(name: String) : List[String] = {
  val codec = scala.io.Codec("UTF-8")
    .onMalformedInput(java.nio.charset.CodingErrorAction.REPLACE)
    .onUnmappableCharacter(java.nio.charset.CodingErrorAction.REPLACE)
  val src = Source.fromFile(name)(codec)
  try src.mkString.split("\n").toList.drop(1)   // drop(1) removes the csv header
  finally src.close()
}
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    28
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    29
// test cases:
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    30
//
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    31
// get_csv("log.csv")
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    32
// get_csv("log.csv").length  // should be 680
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    33
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    34
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    35
//=============
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    36
// (2) The function below takes a single line from the csv-file (as
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    37
// generated by get_csv) and creates a Rec(ord) data structure. The
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    38
// data from the csv-file should be copied as follows:
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    39
//
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    40
//     csv-file         Rec data structure
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    41
//     -----------------------------------
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    42
//      counter      => num
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    43
//      id           => msg_id 
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    44
//      time_date    => date
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    45
//      name         => author
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    46
//      country      => country (should be None if no country is given)
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    47
//      parent_id    => reply_id (should be None if there is no parent)
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    48
//      msg          => msg
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    49
//                   => parent is set to None  (will be calculated later)
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    50
//                   => children is set to Nil (will be calculated later)
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    51
//
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    52
//     You should use the function line.split(",").toList to separate
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    53
//     the items in the csv-line. BE CAREFUL that the message text in 
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    54
//     the last field can contain commas and therefore the split will not
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    55
//     always result in a list of 7 elements. You need to concatenate
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    56
//     anything beyond the 7th field into a single string for the field msg.
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    57
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    58
// Rec(ord) for a single chat message, as built by process_line:
//   num      - counter of the message in the csv-file
//   msg_id   - unique hash code (id) of the message
//   date     - the time_date field of the csv-file
//   msg      - the message text
//   author   - name of the participant who sent the message
//   country  - country of the author; None if no country is given
//   reply_id - hash code of the parent message; None if there is no parent
//   parent   - num of the parent record; None until calculated by post_process
//   children - nums of the direct replies; Nil until calculated by post_process
case class Rec(num: Int, 
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    59
               msg_id: String,
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    60
               date: String,
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    61
               msg: String,
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    62
               author: String,
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    63
               country: Option[String],
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    64
               reply_id : Option[String],
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    65
               parent: Option[Int] = None,
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    66
               children: List[Int] = Nil)  
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    67
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    68
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    69
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    70
// Turns one csv-line (counter,id,time_date,name,country,parent_id,msg) into
// a Rec. An empty country or parent_id becomes None; parent and children keep
// their defaults (None / Nil) and are filled in later by post_process.
// Throws IllegalArgumentException if the line has fewer than 6 fields and
// NumberFormatException if the counter is not an integer.
def process_line(line: String) : Rec = {
  def non_empty(s: String) : Option[String] = if (s.isEmpty) None else Some(s)

  // limit -1 keeps trailing empty fields, which a plain split(",") would
  // silently drop (e.g. lines ending in ",," or messages ending in a comma)
  line.split(",", -1).toList match {
    case num :: id :: date :: author :: country :: parent :: msg_parts =>
      // the message text may itself contain commas: re-join everything
      // after the 6th field into one string
      Rec(num.toInt, id, date, msg_parts.mkString(","), author,
          non_empty(country), non_empty(parent))
    case _ =>
      throw new IllegalArgumentException(s"malformed csv line (fewer than 6 fields): $line")
  }
}
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    71
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    72
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    73
// test cases:
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    74
//
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    75
// process_line("""0,5ebeb459ac278d01301f1497,2020-05-15T15:25:13.413000,participant34,United Kingdom,5ebea6424923321d63155796,this question please?""")
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    76
// 
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    77
//      ==>  Rec(0,
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    78
//               "5ebeb459ac278d01301f1497",
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    79
//               "2020-05-15T15:25:13.413000",
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    80
//               "this question please?",
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    81
//               "participant34",
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    82
//               Some("United Kingdom"),
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    83
//               Some("5ebea6424923321d63155796"),
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    84
//               None,
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    85
//               List())
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    86
//
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    87
// process_line("""1,hash,date,p43,,,foo, bar""")
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    88
//
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    89
//      ==>  Rec(1, "hash", "date", "foo, bar",
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    90
//               "p43", None, None, None, List())
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    91
//
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    92
// (Note that in the second test case the message needs to be "foo, bar")
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    93
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    94
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    95
//=============
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    96
// (3) Each record in the log contains a unique hash code
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    97
//     identifying each message. Some messages also contain a hash
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    98
//     code identifying the parent message (to which question they reply).
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
    99
//     The function post_process fills in the information about
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   100
//     potential children and a potential parent message. 
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   101
//  
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   102
//     The auxiliary function get_children takes a record e and a
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   103
//     record list rs as arguments, and returns the list of all direct
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   104
//     children (which have the hash code of e as reply_id). The list
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   105
//     of children is returned as a list of nums.
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   106
//      
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   107
//     The auxiliary function get_parent returns the number of the
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   108
//     record corresponding to the reply_id (if there exists one,
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   109
//     otherwise returns None).
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   110
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   111
// Returns the nums of all direct children of e, that is of all records in rs
// whose reply_id is the hash code (msg_id) of e. The result follows the
// order of rs; it is Nil if e has no replies.
def get_children(e: Rec, rs: List[Rec]) : List[Int] =
  rs.collect { case r if r.reply_id.contains(e.msg_id) => r.num }
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   112
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   113
// Returns the num of the record in rs whose hash code (msg_id) equals the
// reply_id of e. Returns None if e has no reply_id or if no record with
// that hash code exists in rs.
def get_parent(e: Rec, rs: List[Rec]) : Option[Int] =
  e.reply_id.flatMap(id => rs.find(_.msg_id == id)).map(_.num)
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   114
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   115
// Fills in the parent and children fields of every record in rs using
// get_parent and get_children. The order and all other fields of the
// records are left unchanged.
def post_process(rs: List[Rec]) : List[Rec] =
  rs.map(r => r.copy(parent = get_parent(r, rs), children = get_children(r, rs)))
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   116
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   117
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   118
// test cases:
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   119
//
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   120
//val recs = get_csv("log.csv").map(process_line)
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   121
//
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   122
//post_process(recs)(4).children   // List(12)
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   123
//post_process(recs)(23).children  // List(16,26)
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   124
//
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   125
//post_process(recs)(8).parent     // None
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   126
//post_process(recs)(9).parent     // Some(7)
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   127
//post_process(recs)(16).parent    // Some(23)
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   128
//post_process(recs)(26).parent    // Some(23)
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   129
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   130
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   131
//=============
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   132
// (4) The next two functions calculate the countries where
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   133
//     message authors are coming from and how many authors
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   134
//     come from each country (returned as a Map from countries
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   135
//     to numbers). In case an author did not specify a country,
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   136
//     the empty string is returned.
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   137
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   138
// Returns the set of countries the message authors come from. Records
// without a country contribute the empty string "".
def get_countries(rs: List[Rec]) : Set[String] =
  rs.map(_.country.getOrElse("")).toSet
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   139
  
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   140
// Returns a Map from each country to the number of distinct authors coming
// from that country (authors without a country are counted under "").
// An author who wrote several messages is counted only once per country.
def get_countries_numbers(rs: List[Rec]) :  Map[String, Int] =
  rs.map(r => (r.author, r.country.getOrElse("")))
    .distinct                      // one entry per (author, country) pair
    .groupBy(_._2)
    .map { case (country, authors) => (country, authors.length) }
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   141
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   142
// test cases:
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   143
//
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   144
//val recs = get_csv("log.csv").map(process_line)
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   145
//
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   146
// get_countries(recs) => 
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   147
//
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   148
//    Set("", Poland, Lebanon, Trinidad and
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   149
//        Tobago, Japan, Spain, Nigeria, Peru, India, Lithuania, Hong
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   150
//        Kong, Italy, Ireland, Uganda, England, Bangladesh, China,
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   151
//        Romania, Slovakia, United Kingdom, Norway, Pakistan, Indonesia,
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   152
//        Malaysia, Turkey, Portugal, Hungary)
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   153
//
343
51e25cc30483 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 336
diff changeset
   154
// get_countries_numbers(recs) => 
336
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   155
//
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   156
//    Map("" -> 4, Poland -> 2, Lebanon -> 1, Trinidad and Tobago -> 2,
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   157
//        Japan -> 1, Spain -> 1, Nigeria -> 1, Peru -> 1, India -> 6,
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   158
//        Lithuania -> 1, Hong Kong -> 3, Italy -> 1, Ireland -> 2, Uganda -> 1,
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   159
//        England -> 4, Bangladesh -> 2, China -> 4, Romania -> 3,
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   160
//        Slovakia -> 1, United Kingdom -> 34, Norway -> 1, Pakistan -> 2,
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   161
//        Indonesia -> 2, Malaysia -> 1, Turkey -> 1, Portugal -> 1,
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   162
//        Hungary -> 2)
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   163
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   164
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   165
//============= 
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   166
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   167
// (5) The function ordered_thread_sizes orders the message threads
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   168
// according to how many answers were given for one message (that is
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   169
// how many children, grand-children and so on one message has).
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   170
//
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   171
// The auxiliary function search enumerates all children,
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   172
// grand-children and so on for a given record r (including the record
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   173
// itself). Search returns the children and so on as a list of Recs.
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   174
//
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   175
// The function thread_size generates for a record, say r, a pair
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   176
// consisting of the number of r and the number of all records (r included) as
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   177
// produced by search. 
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   178
//
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   179
// The function ordered_thread_sizes then orders the list of pairs
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   180
// according to which thread in the chat is the longest.
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   181
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   182
// Enumerates r followed by all its children, grand-children and so on
// (depth-first, parents before their children). The records in rs must
// already have their children fields filled in (see post_process).
// Children are looked up by their num, so rs does not need to be ordered
// by num; a child num without a matching record in rs is skipped.
def search(r: Rec, rs: List[Rec]) : List[Rec] =
  r :: r.children
        .flatMap(c => rs.find(_.num == c))
        .flatMap(child => search(child, rs))
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   183
  
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   184
// Returns the pair (num of r, size of the thread starting at r), where the
// size is the number of records produced by search - this includes r itself.
def thread_size(r: Rec, rs: List[Rec]) : (Int, Int) =
  (r.num, search(r, rs).length)
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   185
 
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   186
// Calculates thread_size for every record in rs and orders the resulting
// pairs by thread size, longest thread first. The sort is stable: threads
// of equal size keep the relative order they have in rs.
def ordered_thread_sizes(rs: List[Rec]) : List[(Int, Int)] =
  rs.map(r => thread_size(r, rs)).sortBy(_._2)(Ordering[Int].reverse)
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   187
 
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   188
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   189
// test cases: 
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   190
//
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   191
//val recs_p = post_process(get_csv("log.csv").map(process_line))
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   192
//
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   193
//search(recs_p(459), recs_p).map(_.num)
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   194
//    => List(459, 401, 404, 426, 428, 399, 377, 357, 325, 304)
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   195
//
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   196
//thread_size(recs_p(459), recs_p) 
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   197
//    => (459,10)
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   198
//
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   199
//ordered_thread_sizes(recs_p).take(4)
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   200
//    => List((402,18), (95,12), (488,11), (459,10))
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   201
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   202
cccdae0fccc7 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
   203
}