| 336 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |      1 | // Resit Exam about data-mining a chat log
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |      2 | //=========================================
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |      3 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |      4 | // This 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |      5 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |      6 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |      7 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |      8 | object Resit {
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |      9 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     10 | import io.Source
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     11 | import scala.util._
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     12 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     13 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     14 | // (1) The function below takes file name as argument.  It should read
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     15 | //     the corresponding file and return its content. The content
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     16 | //     should be returned as a list of strings, a string for each line
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     17 | //     in the file. Since the file is a csv-file, the first line
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     18 | //     should be dropped. Lines are separated by "\n".
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     19 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     20 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     21 | def get_csv(name: String) : List[String] = {
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     22 |   val csv = Source.fromFile(name)("ISO-8859-1")
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     23 |   csv.mkString.split("\n").toList.drop(1)
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     24 | }
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     25 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     26 | // test cases:
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     27 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     28 | // get_csv("log.csv")
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     29 | // get_csv("log.csv").length  // should be 680
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     30 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     31 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     32 | // (2) The function below takes a single line from the csv-file (as generated by
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     33 | //     get_csv) and creates a Rec(ord) data structure. The data from the csv-file 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     34 | //     should be copied as follows:
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     35 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     36 | //     csv-file         Rec data structure
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     37 | //     -----------------------------------
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     38 | //      counter      => num
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     39 | //      id           => msg_id 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     40 | //      time_date    => date
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     41 | //      name         => author
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     42 | //      country,     => country (should be None if no country is given)
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     43 | //      parent_id    => reply_id (should be None if there is no parent)
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     44 | //      msg          => msg
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     45 | //                   => parent is set to None  (will be calculated later)
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     46 | //                   => children is set to Nil (will be calculated later)
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     47 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     48 | //     You should use the function line.split(",").toList to separate
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     49 | //     the items in the csv-line. BE CAREFUL that the message texts in 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     50 | //     the last field contain commas and therefore the split will not
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     51 | //     always result into a list of 7 elements. You need to concatenate
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     52 | //     anything beyond the 7th field into a string for the field msg.
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     53 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     54 | case class Rec(num: Int, 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     55 |                msg_id: String,
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     56 |                date: String,
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     57 |                msg: String,
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     58 |                author: String,
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     59 |                country: Option[String],
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     60 |                reply_id : Option[String],
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     61 |                parent: Option[Int] = None,
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     62 |                children: List[Int] = Nil)  
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     63 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     64 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     65 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     66 | def process_line(line: String) : Rec = {
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     67 |   val strs = line.split(",").toList
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     68 |   Rec(num = strs(0).toInt,
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     69 |       msg_id = strs(1),
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     70 |       date = strs(2),
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     71 |       author = strs(3),
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     72 |       country = if (strs(4) == "") None else Some(strs(4)),
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     73 |       reply_id = if (strs(5) == "") None else Some(strs(5)),
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     74 |       msg = (for (i <- 6 until strs.length) yield strs(i)).mkString(","))
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     75 | }
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     76 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     77 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     78 | // test cases:
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     79 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     80 | // process_line("""0,5ebeb459ac278d01301f1497,2020-05-15T15:25:13.413000,participant34,United Kingdom,5ebea6424923321d63155796,this question please?""")
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     81 | // 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     82 | //      ==>  Rec(0,
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     83 | //               "5ebeb459ac278d01301f1497",
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     84 | //               "2020-05-15T15:25:13.413000",
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     85 | //               "this question please?",
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     86 | //               "participant34",
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     87 | //               Some("United Kingdom"),
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     88 | //               Some("5ebea6424923321d63155796"),
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     89 | //               None,
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     90 | //               List())
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     91 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     92 | // process_line("""1,hash,date,p43,,,foo, bar""")
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     93 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     94 | //      ==>  Rec(1, "hash", "date", "foo, bar",
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     95 | //               "p43", None, None, None, List())
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     96 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     97 | // (Note that in the last test case the message needs to be "foo, bar")
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     98 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     99 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    100 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    101 | // (3) Each record in the log contains a unique hash code
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    102 | //     identifying each message. Some messages also contain a hash
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    103 | //     code identifying the parent message (to which question they reply).
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    104 | //     The function post_process fills in the information about
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    105 | //     potential children and a parent message. 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    106 | //  
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    107 | //     The auxiliary function get_children takes a record e and a
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    108 | //     record list rs as arguments, and returns the list of all direct
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    109 | //     children (which have the hash code of e as reply_id. The list
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    110 | //     of children are returned as a list of nums.
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    111 | //      
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    112 | //     The auxiliary function get_parent returns the number of the
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    113 | //     record corresponding to the reply_id (if there exists one,
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    114 | //     otherwise returns None).
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    115 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    116 | def get_children(e: Rec, rs: List[Rec]) : List[Int] = {
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    117 |   (rs.filter(r => r != e &&
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    118 |                   Some(e.msg_id) == r.reply_id)).map(_.num)     
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    119 | }
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    120 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    121 | def get_parent(e: Rec, rs: List[Rec]) : Option[Int] = {
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    122 |   (rs.find(r => r != e &&
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    123 |                 Some(r.msg_id) == e.reply_id)).map(_.num)     
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    124 | }
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    125 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    126 | def post_process(rs: List[Rec]) : List[Rec] =
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    127 |   rs.map(r => r.copy(parent = get_parent(r, rs),
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    128 | 		     children = get_children(r, rs)))
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    129 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    130 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    131 | // test cases:
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    132 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    133 | //val recs = get_csv("log.csv").map(process_line)
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    134 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    135 | //post_process(recs)(4).children   // List(12)
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    136 | //post_process(recs)(23).children  // List(16,26)
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    137 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    138 | //post_process(recs)(8).parent     // None
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    139 | //post_process(recs)(9).parent     // Some(7)
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    140 | //post_process(recs)(16).parent    // Some(23)
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    141 | //post_process(recs)(26).parent    // Some(23)
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    142 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    143 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    144 | // (4) The next two functions calculate the countries where
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    145 | //     message authors are coming from and how many authors
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    146 | //     come from each country (returned as a Map from countries
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    147 | //     to numbers). In case an author did not specify a country,
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    148 | //     the empty string is returned.
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    149 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    150 | def get_countries(rs: List[Rec]) : Set[String] =
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    151 |   rs.map(_.country.getOrElse("")).toSet
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    152 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    153 | def get_countries_numbers(rs: List[Rec]) :  Map[String, Int] = {
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    154 |   val name_countries = rs.map(r => (r.author, r.country.getOrElse(""))).distinct
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    155 |   name_countries.groupBy(_._2).view.mapValues(_.size).toMap
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    156 | }
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    157 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    158 | // test cases:
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    159 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    160 | //val recs = get_csv("log.csv").map(process_line)
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    161 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    162 | // get_countries(recs) => 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    163 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    164 | //    Set("", Poland, Lebanon, Trinidad and
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    165 | //        Tobago, Japan, Spain, Nigeria, Peru, India, Lithuania, Hong
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    166 | //        Kong, Italy, Ireland, Uganda, England, Bangladesh, China,
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    167 | //        Romania, Slovakia, United Kingdom, Norway, Pakistan, Indonesia,
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    168 | //        Malaysia, Turkey, Portugal, Hungary)
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    169 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    170 | // get_countries(recs) => 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    171 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    172 | //    Map("" -> 4, Poland -> 2, Lebanon -> 1, Trinidad and Tobago -> 2,
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    173 | //        Japan -> 1, Spain -> 1, Nigeria -> 1, Peru -> 1, India -> 6,
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    174 | //        Lithuania -> 1, Hong Kong -> 3, Italy -> 1, Ireland -> 2, Uganda -> 1,
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    175 | //        England -> 4, Bangladesh -> 2, China -> 4, Romania -> 3,
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    176 | //        Slovakia -> 1, United Kingdom -> 34, Norway -> 1, Pakistan -> 2,
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    177 | //        Indonesia -> 2, Malaysia -> 1, Turkey -> 1, Portugal -> 1,
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    178 | //        Hungary -> 2)
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    179 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    180 |  
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    181 | // (5) The function thread_sizes orders the message threads according to 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    182 | //     how many answers were given for one message (that is how many children, 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    183 | //     grand-children and so on one message received).
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    184 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    185 | //     The auxiliary function search enumerates all children, grand-children and
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    186 | //     so on for a given record r. Search returns the children and so on as
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    187 | //     a list of Recs. 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    188 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    189 | //     The function thread_sizes generates for every message record
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    190 | //     a pair
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    191 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    192 | def search(r: Rec, rs: List[Rec]) : List[Rec] = 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    193 |   r :: (r.children.map(c => search(rs(c), rs)).flatten)
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    194 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    195 | def thread_size(r: Rec, rs: List[Rec]) : (Int, Int) =
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    196 |   (r.num, search(r, rs).size)
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    197 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    198 | def ordered_thread_sizes(rs: List[Rec]) : List[(Int, Int)] = 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    199 |   rs.map(thread_size(_, rs)).sortBy(_._2).reverse
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    200 |  
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    201 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    202 | // test cases: 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    203 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    204 | //val recs_p = post_process(get_csv("log.csv").map(process_line))
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    205 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    206 | //search(recs_p(459), recs_p).map(_.num)
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    207 | //    => List(459, 401, 404, 426, 428, 399, 377, 357, 325, 304)
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    208 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    209 | //thread_size(recs_p(459), recs_p) 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    210 | //    => (459,10)
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    211 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    212 | //ordered_thread_sizes(recs_p).take(4)
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    213 | //    => List((402,18), (95,12), (488,11), (459,10))
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    214 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    215 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    216 | }
 |