solutions-resit/resit-sol.scala
author Christian Urban <christian.urban@kcl.ac.uk>
Sat, 28 Nov 2020 15:58:36 +0000
changeset 378 7a5ad01a85b5
parent 343 c8fcc0e0a57f
permissions -rw-r--r--
updated
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
336
25d9c3b2bc99 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     1
// Resit Exam about data-mining a chat log
25d9c3b2bc99 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     2
//=========================================
25d9c3b2bc99 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     3
//
25d9c3b2bc99 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     4
// This file contains the sample solution for the resit exam.
25d9c3b2bc99 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     5
//
25d9c3b2bc99 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     6
25d9c3b2bc99 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     7
25d9c3b2bc99 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff changeset
     8
object Resit {

import scala.io.Source
import scala.util._


// (1) The function below takes a file name as argument.  It should read
//     the corresponding file and return its content. The content
//     should be returned as a list of strings, a string for each line
//     in the file. Since the file is a csv-file, the first line
//     (the header) should be dropped. Lines are separated by "\n".
//
//     The file is decoded as ISO-8859-1. The underlying Source is
//     always closed again, even if reading the file fails.

def get_csv(name: String) : List[String] = {
  val csv = Source.fromFile(name, "ISO-8859-1")
  try csv.mkString.split("\n").toList.drop(1)
  finally csv.close()
}

// test cases:
//
// get_csv("log.csv")
// get_csv("log.csv").length  // should be 680


// (2) The function below takes a single line from the csv-file (as generated by
//     get_csv) and creates a Rec(ord) data structure. The data from the csv-file
//     should be copied as follows:
//
//     csv-file         Rec data structure
//     -----------------------------------
//      counter      => num
//      id           => msg_id
//      time_date    => date
//      name         => author
//      country      => country (should be None if no country is given)
//      parent_id    => reply_id (should be None if there is no parent)
//      msg          => msg
//                   => parent is set to None  (will be calculated later)
//                   => children is set to Nil (will be calculated later)
//
//     The items in a csv-line are separated by commas. BE CAREFUL that
//     the message texts in the last field can contain commas themselves
//     and therefore a plain line.split(",") will not always result in
//     a list of 7 elements. Anything beyond the 7th field belongs to
//     the field msg.
//
//     The solution below calls split with a limit of 7: the line is cut
//     at the first six commas only, so the message stays in one piece.
//     In contrast to split(","), a positive limit also keeps trailing
//     empty fields, so a line such as "2,hash,date,p1,,," (no country,
//     no parent, empty message) is processed instead of raising an
//     IndexOutOfBoundsException.

case class Rec(num: Int,                   // running counter of the message
               msg_id: String,             // hash code identifying the message
               date: String,               // time stamp, kept as a string
               msg: String,                // message text (may contain commas)
               author: String,             // name of the author
               country: Option[String],    // None if no country was given
               reply_id : Option[String],  // hash code of the parent, if any
               parent: Option[Int] = None, // num of the parent (see post_process)
               children: List[Int] = Nil)  // nums of direct replies (see post_process)



def process_line(line: String) : Rec = {
  // at most 7 pieces; lines with fewer fields are padded with empty strings
  val fields = line.split(",", 7).padTo(7, "")

  // empty csv-fields stand for "no value"
  def optional(s: String) : Option[String] =
    if (s.isEmpty) None else Some(s)

  Rec(num = fields(0).toInt,
      msg_id = fields(1),
      date = fields(2),
      author = fields(3),
      country = optional(fields(4)),
      reply_id = optional(fields(5)),
      msg = fields(6))
}


// test cases:
//
// process_line("""0,5ebeb459ac278d01301f1497,2020-05-15T15:25:13.413000,participant34,United Kingdom,5ebea6424923321d63155796,this question please?""")
//
//      ==>  Rec(0,
//               "5ebeb459ac278d01301f1497",
//               "2020-05-15T15:25:13.413000",
//               "this question please?",
//               "participant34",
//               Some("United Kingdom"),
//               Some("5ebea6424923321d63155796"),
//               None,
//               List())
//
// process_line("""1,hash,date,p43,,,foo, bar""")
//
//      ==>  Rec(1, "hash", "date", "foo, bar",
//               "p43", None, None, None, List())
//
// (Note that in the last test case the message needs to be "foo, bar")



// (3) Each record in the log contains a unique hash code
//     identifying each message. Some messages also contain a hash
//     code identifying the parent message (to which question they reply).
//     The function post_process fills in the information about
//     potential children and a parent message.
//
//     The auxiliary function get_children takes a record e and a
//     record list rs as arguments, and returns the list of all direct
//     children (which have the hash code of e as reply_id). The list
//     of children is returned as a list of nums, in the order in which
//     the children occur in rs.
//
//     The auxiliary function get_parent returns the number of the
//     first record in rs whose hash code is the reply_id of e (if there
//     exists one, otherwise returns None).
//
//     In both functions records that are equal to e are ignored.

def get_children(e: Rec, rs: List[Rec]) : List[Int] =
  rs.collect { case r if r != e && r.reply_id.contains(e.msg_id) => r.num }

def get_parent(e: Rec, rs: List[Rec]) : Option[Int] =
  rs.collectFirst { case r if r != e && e.reply_id.contains(r.msg_id) => r.num }

def post_process(rs: List[Rec]) : List[Rec] =
  rs.map { r =>
    r.copy(parent = get_parent(r, rs),
           children = get_children(r, rs))
  }


// test cases:
//
//val recs = get_csv("log.csv").map(process_line)
//
//post_process(recs)(4).children   // List(12)
//post_process(recs)(23).children  // List(16,26)
//
//post_process(recs)(8).parent     // None
//post_process(recs)(9).parent     // Some(7)
//post_process(recs)(16).parent    // Some(23)
//post_process(recs)(26).parent    // Some(23)


// (4) The next two functions calculate the countries where
//     message authors are coming from and how many authors
//     come from each country (returned as a Map from countries
//     to numbers). In case an author did not specify a country,
//     the empty string is returned.
//
//     An author who wrote several messages is counted only once
//     per country.

def get_countries(rs: List[Rec]) : Set[String] =
  rs.map(_.country.getOrElse("")).toSet

def get_countries_numbers(rs: List[Rec]) : Map[String, Int] =
  rs.map(r => (r.author, r.country.getOrElse("")))
    .distinct                              // one entry per (author, country)
    .groupMapReduce(_._2)(_ => 1)(_ + _)   // count the entries per country

// test cases:
//
//val recs = get_csv("log.csv").map(process_line)
//
// get_countries(recs) =>
//
//    Set("", Poland, Lebanon, Trinidad and
//        Tobago, Japan, Spain, Nigeria, Peru, India, Lithuania, Hong
//        Kong, Italy, Ireland, Uganda, England, Bangladesh, China,
//        Romania, Slovakia, United Kingdom, Norway, Pakistan, Indonesia,
//        Malaysia, Turkey, Portugal, Hungary)
//
// get_countries_numbers(recs) =>
//
//    Map("" -> 4, Poland -> 2, Lebanon -> 1, Trinidad and Tobago -> 2,
//        Japan -> 1, Spain -> 1, Nigeria -> 1, Peru -> 1, India -> 6,
//        Lithuania -> 1, Hong Kong -> 3, Italy -> 1, Ireland -> 2, Uganda -> 1,
//        England -> 4, Bangladesh -> 2, China -> 4, Romania -> 3,
//        Slovakia -> 1, United Kingdom -> 34, Norway -> 1, Pakistan -> 2,
//        Indonesia -> 2, Malaysia -> 1, Turkey -> 1, Portugal -> 1,
//        Hungary -> 2)


// (5) The function ordered_thread_sizes orders the message threads according
//     to their size, that is according to how many messages belong to the
//     thread started by one message (the message itself, its children,
//     grand-children and so on).
//
//     The auxiliary function search enumerates a given record r followed by
//     all its children, grand-children and so on. Search returns them as
//     a list of Recs. The children of a record are looked up by position
//     in rs; this relies on every record being stored at the index given
//     by its num, and on the children having been filled in by post_process.
//
//     The function thread_size generates for a message record a pair
//     consisting of the num of the record and the length of the list
//     returned by search. The function ordered_thread_sizes generates
//     such a pair for every record and lists the biggest threads first.

def search(r: Rec, rs: List[Rec]) : List[Rec] =
  r :: r.children.flatMap(c => search(rs(c), rs))

def thread_size(r: Rec, rs: List[Rec]) : (Int, Int) =
  (r.num, search(r, rs).size)

def ordered_thread_sizes(rs: List[Rec]) : List[(Int, Int)] =
  rs.map(r => thread_size(r, rs)).sortBy(_._2).reverse


// test cases:
//
//val recs_p = post_process(get_csv("log.csv").map(process_line))
//
//search(recs_p(459), recs_p).map(_.num)
//    => List(459, 401, 404, 426, 428, 399, 377, 357, 325, 304)
//
//thread_size(recs_p(459), recs_p)
//    => (459,10)
//
//ordered_thread_sizes(recs_p).take(4)
//    => List((402,18), (95,12), (488,11), (459,10))


}