solutions-resit/resit-sol.scala
changeset 486 9c03b5e89a2a
parent 485 19b75e899d37
child 487 efad9725dfd8
equal deleted inserted replaced
485:19b75e899d37 486:9c03b5e89a2a
     1 // Resit Exam about data-mining a chat log
       
     2 //=========================================
       
     3 //
       
     4 // This file contains the solutions for the tasks (1) - (5) below.
       
     5 //
       
     6 
       
     7 
       
     8 object Resit {
       
     9 
       
    10 import io.Source
       
    11 import scala.util._
       
    12 
       
    13 
       
     14 // (1) The function below takes a file name as argument.  It should read
       
    15 //     the corresponding file and return its content. The content
       
    16 //     should be returned as a list of strings, a string for each line
       
    17 //     in the file. Since the file is a csv-file, the first line
       
    18 //     should be dropped. Lines are separated by "\n".
       
    19 
       
    20 
       
    21 def get_csv(name: String) : List[String] = {
       
    22   val csv = Source.fromFile(name)("ISO-8859-1")
       
    23   csv.mkString.split("\n").toList.drop(1)
       
    24 }
       
    25 
       
    26 // test cases:
       
    27 //
       
    28 // get_csv("log.csv")
       
    29 // get_csv("log.csv").length  // should be 680
       
    30 
       
    31 
       
    32 // (2) The function below takes a single line from the csv-file (as generated by
       
    33 //     get_csv) and creates a Rec(ord) data structure. The data from the csv-file 
       
    34 //     should be copied as follows:
       
    35 //
       
    36 //     csv-file         Rec data structure
       
    37 //     -----------------------------------
       
    38 //      counter      => num
       
    39 //      id           => msg_id 
       
    40 //      time_date    => date
       
    41 //      name         => author
       
     42 //      country      => country (should be None if no country is given)
       
    43 //      parent_id    => reply_id (should be None if there is no parent)
       
    44 //      msg          => msg
       
    45 //                   => parent is set to None  (will be calculated later)
       
    46 //                   => children is set to Nil (will be calculated later)
       
    47 //
       
    48 //     You should use the function line.split(",").toList to separate
       
    49 //     the items in the csv-line. BE CAREFUL that the message texts in 
       
    50 //     the last field contain commas and therefore the split will not
       
     51 //     always result in a list of 7 elements. You need to concatenate
       
    52 //     anything beyond the 7th field into a string for the field msg.
       
    53 
       
    54 case class Rec(num: Int, 
       
    55                msg_id: String,
       
    56                date: String,
       
    57                msg: String,
       
    58                author: String,
       
    59                country: Option[String],
       
    60                reply_id : Option[String],
       
    61                parent: Option[Int] = None,
       
    62                children: List[Int] = Nil)  
       
    63 
       
    64 
       
    65 
       
    66 def process_line(line: String) : Rec = {
       
    67   val strs = line.split(",").toList
       
    68   Rec(num = strs(0).toInt,
       
    69       msg_id = strs(1),
       
    70       date = strs(2),
       
    71       author = strs(3),
       
    72       country = if (strs(4) == "") None else Some(strs(4)),
       
    73       reply_id = if (strs(5) == "") None else Some(strs(5)),
       
    74       msg = (for (i <- 6 until strs.length) yield strs(i)).mkString(","))
       
    75 }
       
    76 
       
    77 
       
    78 // test cases:
       
    79 //
       
    80 // process_line("""0,5ebeb459ac278d01301f1497,2020-05-15T15:25:13.413000,participant34,United Kingdom,5ebea6424923321d63155796,this question please?""")
       
    81 // 
       
    82 //      ==>  Rec(0,
       
    83 //               "5ebeb459ac278d01301f1497",
       
    84 //               "2020-05-15T15:25:13.413000",
       
    85 //               "this question please?",
       
    86 //               "participant34",
       
    87 //               Some("United Kingdom"),
       
    88 //               Some("5ebea6424923321d63155796"),
       
    89 //               None,
       
    90 //               List())
       
    91 //
       
    92 // process_line("""1,hash,date,p43,,,foo, bar""")
       
    93 //
       
    94 //      ==>  Rec(1, "hash", "date", "foo, bar",
       
    95 //               "p43", None, None, None, List())
       
    96 //
       
    97 // (Note that in the last test case the message needs to be "foo, bar")
       
    98 
       
    99 
       
   100 
       
   101 // (3) Each record in the log contains a unique hash code
       
   102 //     identifying each message. Some messages also contain a hash
       
   103 //     code identifying the parent message (to which question they reply).
       
   104 //     The function post_process fills in the information about
       
   105 //     potential children and a parent message. 
       
   106 //  
       
   107 //     The auxiliary function get_children takes a record e and a
       
   108 //     record list rs as arguments, and returns the list of all direct
       
    109 //     children (which have the hash code of e as reply_id). The list
       
    110 //     of children is returned as a list of nums.
       
   111 //      
       
   112 //     The auxiliary function get_parent returns the number of the
       
   113 //     record corresponding to the reply_id (if there exists one,
       
   114 //     otherwise returns None).
       
   115 
       
   116 def get_children(e: Rec, rs: List[Rec]) : List[Int] = {
       
   117   (rs.filter(r => r != e &&
       
   118                   Some(e.msg_id) == r.reply_id)).map(_.num)     
       
   119 }
       
   120 
       
   121 def get_parent(e: Rec, rs: List[Rec]) : Option[Int] = {
       
   122   (rs.find(r => r != e &&
       
   123                 Some(r.msg_id) == e.reply_id)).map(_.num)     
       
   124 }
       
   125 
       
   126 def post_process(rs: List[Rec]) : List[Rec] =
       
   127   rs.map(r => r.copy(parent = get_parent(r, rs),
       
   128 		     children = get_children(r, rs)))
       
   129 
       
   130 
       
   131 // test cases:
       
   132 //
       
   133 //val recs = get_csv("log.csv").map(process_line)
       
   134 //
       
   135 //post_process(recs)(4).children   // List(12)
       
   136 //post_process(recs)(23).children  // List(16,26)
       
   137 //
       
   138 //post_process(recs)(8).parent     // None
       
   139 //post_process(recs)(9).parent     // Some(7)
       
   140 //post_process(recs)(16).parent    // Some(23)
       
   141 //post_process(recs)(26).parent    // Some(23)
       
   142 
       
   143 
       
   144 // (4) The next two functions calculate the countries where
       
   145 //     message authors are coming from and how many authors
       
   146 //     come from each country (returned as a Map from countries
       
   147 //     to numbers). In case an author did not specify a country,
       
   148 //     the empty string is returned.
       
   149 
       
   150 def get_countries(rs: List[Rec]) : Set[String] =
       
   151   rs.map(_.country.getOrElse("")).toSet
       
   152 
       
   153 def get_countries_numbers(rs: List[Rec]) :  Map[String, Int] = {
       
   154   val name_countries = rs.map(r => (r.author, r.country.getOrElse(""))).distinct
       
   155   name_countries.groupBy(_._2).view.mapValues(_.size).toMap
       
   156 }
       
   157 
       
   158 // test cases:
       
   159 //
       
   160 //val recs = get_csv("log.csv").map(process_line)
       
   161 //
       
   162 // get_countries(recs) => 
       
   163 //
       
   164 //    Set("", Poland, Lebanon, Trinidad and
       
   165 //        Tobago, Japan, Spain, Nigeria, Peru, India, Lithuania, Hong
       
   166 //        Kong, Italy, Ireland, Uganda, England, Bangladesh, China,
       
   167 //        Romania, Slovakia, United Kingdom, Norway, Pakistan, Indonesia,
       
   168 //        Malaysia, Turkey, Portugal, Hungary)
       
   169 //
       
   170 // get_countries_numbers(recs) => 
       
   171 //
       
   172 //    Map("" -> 4, Poland -> 2, Lebanon -> 1, Trinidad and Tobago -> 2,
       
   173 //        Japan -> 1, Spain -> 1, Nigeria -> 1, Peru -> 1, India -> 6,
       
   174 //        Lithuania -> 1, Hong Kong -> 3, Italy -> 1, Ireland -> 2, Uganda -> 1,
       
   175 //        England -> 4, Bangladesh -> 2, China -> 4, Romania -> 3,
       
   176 //        Slovakia -> 1, United Kingdom -> 34, Norway -> 1, Pakistan -> 2,
       
   177 //        Indonesia -> 2, Malaysia -> 1, Turkey -> 1, Portugal -> 1,
       
   178 //        Hungary -> 2)
       
   179 
       
   180  
       
    181 // (5) The function ordered_thread_sizes orders the message threads according to 
       
   182 //     how many answers were given for one message (that is how many children, 
       
   183 //     grand-children and so on one message received).
       
   184 //
       
   185 //     The auxiliary function search enumerates all children, grand-children and
       
   186 //     so on for a given record r. Search returns the children and so on as
       
   187 //     a list of Recs. 
       
   188 //
       
    189 //     The function thread_size generates for every message record
       
    190 //     a pair (num, size of the thread starting at that record, itself included).
       
   191 
       
   192 def search(r: Rec, rs: List[Rec]) : List[Rec] = 
       
   193   r :: (r.children.map(c => search(rs(c), rs)).flatten)
       
   194 
       
   195 def thread_size(r: Rec, rs: List[Rec]) : (Int, Int) =
       
   196   (r.num, search(r, rs).size)
       
   197 
       
   198 def ordered_thread_sizes(rs: List[Rec]) : List[(Int, Int)] = 
       
   199   rs.map(thread_size(_, rs)).sortBy(_._2).reverse
       
   200  
       
   201 
       
   202 // test cases: 
       
   203 //
       
   204 //val recs_p = post_process(get_csv("log.csv").map(process_line))
       
   205 //
       
   206 //search(recs_p(459), recs_p).map(_.num)
       
   207 //    => List(459, 401, 404, 426, 428, 399, 377, 357, 325, 304)
       
   208 //
       
   209 //thread_size(recs_p(459), recs_p) 
       
   210 //    => (459,10)
       
   211 //
       
   212 //ordered_thread_sizes(recs_p).take(4)
       
   213 //    => List((402,18), (95,12), (488,11), (459,10))
       
   214 
       
   215 
       
   216 }