336
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
     1  | 
// Resit Exam about data-mining a chat log
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
     2  | 
//=========================================
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
     3  | 
//
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
     4  | 
// This 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
     5  | 
//
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
     6  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
     7  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
     8  | 
object Resit {
 | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
     9  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    10  | 
import io.Source
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    11  | 
import scala.util._
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    12  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    13  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    14  | 
// (1) The function below takes file name as argument.  It should read
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    15  | 
//     the corresponding file and return its content. The content
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    16  | 
//     should be returned as a list of strings, a string for each line
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    17  | 
//     in the file. Since the file is a csv-file, the first line
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    18  | 
//     should be dropped. Lines are separated by "\n".
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    19  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    20  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    21  | 
def get_csv(name: String) : List[String] = {
 | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    22  | 
  val csv = Source.fromFile(name)("ISO-8859-1")
 | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    23  | 
  csv.mkString.split("\n").toList.drop(1)
 | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    24  | 
}
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    25  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    26  | 
// test cases:
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    27  | 
//
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    28  | 
// get_csv("log.csv")
 | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    29  | 
// get_csv("log.csv").length  // should be 680
 | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    30  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    31  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    32  | 
// (2) The function below takes a single line from the csv-file (as generated by
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    33  | 
//     get_csv) and creates a Rec(ord) data structure. The data from the csv-file 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    34  | 
//     should be copied as follows:
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    35  | 
//
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    36  | 
//     csv-file         Rec data structure
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    37  | 
//     -----------------------------------
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    38  | 
//      counter      => num
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    39  | 
//      id           => msg_id 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    40  | 
//      time_date    => date
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    41  | 
//      name         => author
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    42  | 
//      country,     => country (should be None if no country is given)
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    43  | 
//      parent_id    => reply_id (should be None if there is no parent)
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    44  | 
//      msg          => msg
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    45  | 
//                   => parent is set to None  (will be calculated later)
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    46  | 
//                   => children is set to Nil (will be calculated later)
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    47  | 
//
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    48  | 
//     You should use the function line.split(",").toList to separate
 | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    49  | 
//     the items in the csv-line. BE CAREFUL that the message texts in 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    50  | 
//     the last field contain commas and therefore the split will not
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    51  | 
//     always result into a list of 7 elements. You need to concatenate
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    52  | 
//     anything beyond the 7th field into a string for the field msg.
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    53  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    54  | 
case class Rec(num: Int, 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    55  | 
               msg_id: String,
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    56  | 
               date: String,
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    57  | 
               msg: String,
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    58  | 
               author: String,
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    59  | 
               country: Option[String],
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    60  | 
               reply_id : Option[String],
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    61  | 
               parent: Option[Int] = None,
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    62  | 
               children: List[Int] = Nil)  
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    63  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    64  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    65  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    66  | 
def process_line(line: String) : Rec = {
 | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    67  | 
  val strs = line.split(",").toList
 | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    68  | 
  Rec(num = strs(0).toInt,
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    69  | 
      msg_id = strs(1),
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    70  | 
      date = strs(2),
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    71  | 
      author = strs(3),
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    72  | 
      country = if (strs(4) == "") None else Some(strs(4)),
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    73  | 
      reply_id = if (strs(5) == "") None else Some(strs(5)),
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    74  | 
      msg = (for (i <- 6 until strs.length) yield strs(i)).mkString(","))
 | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    75  | 
}
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    76  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    77  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    78  | 
// test cases:
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    79  | 
//
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    80  | 
// process_line("""0,5ebeb459ac278d01301f1497,2020-05-15T15:25:13.413000,participant34,United Kingdom,5ebea6424923321d63155796,this question please?""")
 | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    81  | 
// 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    82  | 
//      ==>  Rec(0,
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    83  | 
//               "5ebeb459ac278d01301f1497",
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    84  | 
//               "2020-05-15T15:25:13.413000",
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    85  | 
//               "this question please?",
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    86  | 
//               "participant34",
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    87  | 
//               Some("United Kingdom"),
 | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    88  | 
//               Some("5ebea6424923321d63155796"),
 | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    89  | 
//               None,
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    90  | 
//               List())
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    91  | 
//
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    92  | 
// process_line("""1,hash,date,p43,,,foo, bar""")
 | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    93  | 
//
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    94  | 
//      ==>  Rec(1, "hash", "date", "foo, bar",
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    95  | 
//               "p43", None, None, None, List())
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    96  | 
//
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    97  | 
// (Note that in the last test case the message needs to be "foo, bar")
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    98  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
    99  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   100  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   101  | 
// (3) Each record in the log contains a unique hash code
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   102  | 
//     identifying each message. Some messages also contain a hash
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   103  | 
//     code identifying the parent message (to which question they reply).
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   104  | 
//     The function post_process fills in the information about
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   105  | 
//     potential children and a parent message. 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   106  | 
//  
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   107  | 
//     The auxiliary function get_children takes a record e and a
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   108  | 
//     record list rs as arguments, and returns the list of all direct
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   109  | 
//     children (which have the hash code of e as reply_id. The list
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   110  | 
//     of children are returned as a list of nums.
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   111  | 
//      
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   112  | 
//     The auxiliary function get_parent returns the number of the
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   113  | 
//     record corresponding to the reply_id (if there exists one,
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   114  | 
//     otherwise returns None).
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   115  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   116  | 
def get_children(e: Rec, rs: List[Rec]) : List[Int] = {
 | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   117  | 
  (rs.filter(r => r != e &&
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   118  | 
                  Some(e.msg_id) == r.reply_id)).map(_.num)     
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   119  | 
}
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   120  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   121  | 
def get_parent(e: Rec, rs: List[Rec]) : Option[Int] = {
 | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   122  | 
  (rs.find(r => r != e &&
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   123  | 
                Some(r.msg_id) == e.reply_id)).map(_.num)     
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   124  | 
}
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   125  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   126  | 
def post_process(rs: List[Rec]) : List[Rec] =
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   127  | 
  rs.map(r => r.copy(parent = get_parent(r, rs),
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   128  | 
		     children = get_children(r, rs)))
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   129  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   130  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   131  | 
// test cases:
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   132  | 
//
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   133  | 
//val recs = get_csv("log.csv").map(process_line)
 | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   134  | 
//
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   135  | 
//post_process(recs)(4).children   // List(12)
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   136  | 
//post_process(recs)(23).children  // List(16,26)
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   137  | 
//
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   138  | 
//post_process(recs)(8).parent     // None
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   139  | 
//post_process(recs)(9).parent     // Some(7)
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   140  | 
//post_process(recs)(16).parent    // Some(23)
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   141  | 
//post_process(recs)(26).parent    // Some(23)
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   142  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   143  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   144  | 
// (4) The next two functions calculate the countries where
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   145  | 
//     message authors are coming from and how many authors
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   146  | 
//     come from each country (returned as a Map from countries
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   147  | 
//     to numbers). In case an author did not specify a country,
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   148  | 
//     the empty string is returned.
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   149  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   150  | 
def get_countries(rs: List[Rec]) : Set[String] =
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   151  | 
  rs.map(_.country.getOrElse("")).toSet
 | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   152  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   153  | 
def get_countries_numbers(rs: List[Rec]) :  Map[String, Int] = {
 | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   154  | 
  val name_countries = rs.map(r => (r.author, r.country.getOrElse(""))).distinct
 | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   155  | 
  name_countries.groupBy(_._2).view.mapValues(_.size).toMap
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   156  | 
}
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   157  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   158  | 
// test cases:
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   159  | 
//
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   160  | 
//val recs = get_csv("log.csv").map(process_line)
 | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   161  | 
//
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   162  | 
// get_countries(recs) => 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   163  | 
//
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   164  | 
//    Set("", Poland, Lebanon, Trinidad and
 | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   165  | 
//        Tobago, Japan, Spain, Nigeria, Peru, India, Lithuania, Hong
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   166  | 
//        Kong, Italy, Ireland, Uganda, England, Bangladesh, China,
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   167  | 
//        Romania, Slovakia, United Kingdom, Norway, Pakistan, Indonesia,
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   168  | 
//        Malaysia, Turkey, Portugal, Hungary)
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   169  | 
//
  | 
| 
343
 | 
   170  | 
// get_countries_numbers(recs) => 
  | 
336
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   171  | 
//
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   172  | 
//    Map("" -> 4, Poland -> 2, Lebanon -> 1, Trinidad and Tobago -> 2,
 | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   173  | 
//        Japan -> 1, Spain -> 1, Nigeria -> 1, Peru -> 1, India -> 6,
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   174  | 
//        Lithuania -> 1, Hong Kong -> 3, Italy -> 1, Ireland -> 2, Uganda -> 1,
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   175  | 
//        England -> 4, Bangladesh -> 2, China -> 4, Romania -> 3,
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   176  | 
//        Slovakia -> 1, United Kingdom -> 34, Norway -> 1, Pakistan -> 2,
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   177  | 
//        Indonesia -> 2, Malaysia -> 1, Turkey -> 1, Portugal -> 1,
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   178  | 
//        Hungary -> 2)
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   179  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   180  | 
 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   181  | 
// (5) The function thread_sizes orders the message threads according to 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   182  | 
//     how many answers were given for one message (that is how many children, 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   183  | 
//     grand-children and so on one message received).
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   184  | 
//
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   185  | 
//     The auxiliary function search enumerates all children, grand-children and
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   186  | 
//     so on for a given record r. Search returns the children and so on as
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   187  | 
//     a list of Recs. 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   188  | 
//
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   189  | 
//     The function thread_sizes generates for every message record
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   190  | 
//     a pair
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   191  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   192  | 
def search(r: Rec, rs: List[Rec]) : List[Rec] = 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   193  | 
  r :: (r.children.map(c => search(rs(c), rs)).flatten)
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   194  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   195  | 
def thread_size(r: Rec, rs: List[Rec]) : (Int, Int) =
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   196  | 
  (r.num, search(r, rs).size)
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   197  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   198  | 
def ordered_thread_sizes(rs: List[Rec]) : List[(Int, Int)] = 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   199  | 
  rs.map(thread_size(_, rs)).sortBy(_._2).reverse
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   200  | 
 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   201  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   202  | 
// test cases: 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   203  | 
//
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   204  | 
//val recs_p = post_process(get_csv("log.csv").map(process_line))
 | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   205  | 
//
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   206  | 
//search(recs_p(459), recs_p).map(_.num)
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   207  | 
//    => List(459, 401, 404, 426, 428, 399, 377, 357, 325, 304)
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   208  | 
//
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   209  | 
//thread_size(recs_p(459), recs_p) 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   210  | 
//    => (459,10)
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   211  | 
//
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   212  | 
//ordered_thread_sizes(recs_p).take(4)
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   213  | 
//    => List((402,18), (95,12), (488,11), (459,10))
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   214  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   215  | 
  | 
Christian Urban <christian.urban@kcl.ac.uk> 
parents:  
diff
changeset
 
 | 
   216  | 
}
  |