// Resit Exam about data-mining a chat log
//=========================================
//
// This
//


object Resit {

  import scala.io.Source
  import scala.util._


  // (1) The function below takes a file name as argument. It reads
  //     the corresponding file (decoded as ISO-8859-1) and returns its
  //     content as a list of strings, one string for each line in the
  //     file. Since the file is a csv-file, the first (header) line
  //     is dropped. Lines are separated by "\n".

  def get_csv(name: String) : List[String] = {
    // Using.resource closes the underlying file handle even if reading
    // throws; the Source was previously left open.
    val content = Using.resource(Source.fromFile(name, "ISO-8859-1"))(_.mkString)
    content.split("\n").toList.drop(1)
  }

  // test cases:
  //
  // get_csv("log.csv")
  // get_csv("log.csv").length  // should be 680


  // (2) The function below takes a single line from the csv-file (as generated by
  //     get_csv) and creates a Rec(ord) data structure. The data from the csv-file
  //     is copied as follows:
  //
  //     csv-file         Rec data structure
  //     -----------------------------------
  //     counter      =>  num
  //     id           =>  msg_id
  //     time_date    =>  date
  //     name         =>  author
  //     country      =>  country  (None if no country is given)
  //     parent_id    =>  reply_id (None if there is no parent)
  //     msg          =>  msg
  //                  =>  parent is set to None (will be calculated later)
  //                  =>  children is set to Nil (will be calculated later)
  //
  //     BE CAREFUL: the message text in the last field may itself contain
  //     commas, so a plain split on "," does not always produce 7 elements.
  //     The line is therefore split with a limit of 7: the first six commas
  //     separate the fields and everything after them is kept verbatim as
  //     the message. A limit also keeps trailing empty fields, which a plain
  //     line.split(",") would silently drop.

  case class Rec(num: Int,
                 msg_id: String,
                 date: String,
                 msg: String,
                 author: String,
                 country: Option[String],
                 reply_id : Option[String],
                 parent: Option[Int] = None,
                 children: List[Int] = Nil)

  // Maps the empty string (a field left blank in the csv) to None.
  private def non_empty(s: String) : Option[String] =
    if (s.isEmpty) None else Some(s)

  // Throws NumberFormatException if the counter field is not an Int and
  // IndexOutOfBoundsException if the line has fewer than six fields.
  def process_line(line: String) : Rec = {
    // limit 7 => at most seven fields; the seventh holds the whole message
    val fields = line.split(",", 7).toList
    Rec(num = fields(0).toInt,
        msg_id = fields(1),
        date = fields(2),
        author = fields(3),
        country = non_empty(fields(4)),
        reply_id = non_empty(fields(5)),
        // empty message if the line stops after the sixth field
        msg = fields.drop(6).mkString(","))
  }


  // test cases:
  //
  // process_line("""0,5ebeb459ac278d01301f1497,2020-05-15T15:25:13.413000,participant34,United Kingdom,5ebea6424923321d63155796,this question please?""")
  //
  // ==> Rec(0,
  //         "5ebeb459ac278d01301f1497",
  //         "2020-05-15T15:25:13.413000",
  //         "this question please?",
  //         "participant34",
  //         Some("United Kingdom"),
  //         Some("5ebea6424923321d63155796"),
  //         None,
  //         List())
  //
  // process_line("""1,hash,date,p43,,,foo, bar""")
  //
  // ==> Rec(1, "hash", "date", "foo, bar",
  //         "p43", None, None, None, List())
  //
  // (Note that in the last test case the message needs to be "foo, bar")



  // (3) Each record in the log contains a unique hash code
  //     identifying each message. Some messages also contain a hash
  //     code identifying the parent message (to which question they reply).
  //     The function post_process fills in the information about
  //     potential children and a parent message.
  //
  //     The auxiliary function get_children takes a record e and a
  //     record list rs as arguments, and returns the list of all direct
  //     children (records which have the hash code of e as reply_id).
  //     The children are returned as a list of nums.
  //
  //     The auxiliary function get_parent returns the number of the
  //     record corresponding to the reply_id (if there exists one,
  //     otherwise it returns None).

  def get_children(e: Rec, rs: List[Rec]) : List[Int] =
    // a record equal to e is never reported as its own child
    rs.collect { case r if r != e && r.reply_id == Some(e.msg_id) => r.num }

  def get_parent(e: Rec, rs: List[Rec]) : Option[Int] =
    // first record (other than e itself) whose hash code matches e's reply_id
    rs.find(r => r != e && e.reply_id == Some(r.msg_id)).map(_.num)

  def post_process(rs: List[Rec]) : List[Rec] =
    rs.map(r => r.copy(parent = get_parent(r, rs),
                       children = get_children(r, rs)))


  // test cases:
  //
  //val recs = get_csv("log.csv").map(process_line)
  //
  //post_process(recs)(4).children   // List(12)
  //post_process(recs)(23).children  // List(16,26)
  //
  //post_process(recs)(8).parent     // None
  //post_process(recs)(9).parent     // Some(7)
  //post_process(recs)(16).parent    // Some(23)
  //post_process(recs)(26).parent    // Some(23)


  // (4) The next two functions calculate the countries where
  //     message authors are coming from and how many authors
  //     come from each country (returned as a Map from countries
  //     to numbers). In case an author did not specify a country,
  //     the empty string is used as the country.

  def get_countries(rs: List[Rec]) : Set[String] =
    rs.map(_.country.getOrElse("")).toSet

  def get_countries_numbers(rs: List[Rec]) : Map[String, Int] = {
    // count every (author, country) combination only once
    val name_countries = rs.map(r => (r.author, r.country.getOrElse(""))).distinct
    name_countries.groupMapReduce(_._2)(_ => 1)(_ + _)
  }

  // test cases:
  //
  //val recs = get_csv("log.csv").map(process_line)
  //
  // get_countries(recs) =>
  //
  //    Set("", Poland, Lebanon, Trinidad and
  //    Tobago, Japan, Spain, Nigeria, Peru, India, Lithuania, Hong
  //    Kong, Italy, Ireland, Uganda, England, Bangladesh, China,
  //    Romania, Slovakia, United Kingdom, Norway, Pakistan, Indonesia,
  //    Malaysia, Turkey, Portugal, Hungary)
  //
  // get_countries_numbers(recs) =>
  //
  //    Map("" -> 4, Poland -> 2, Lebanon -> 1, Trinidad and Tobago -> 2,
  //        Japan -> 1, Spain -> 1, Nigeria -> 1, Peru -> 1, India -> 6,
  //        Lithuania -> 1, Hong Kong -> 3, Italy -> 1, Ireland -> 2, Uganda -> 1,
  //        England -> 4, Bangladesh -> 2, China -> 4, Romania -> 3,
  //        Slovakia -> 1, United Kingdom -> 34, Norway -> 1, Pakistan -> 2,
  //        Indonesia -> 2, Malaysia -> 1, Turkey -> 1, Portugal -> 1,
  //        Hungary -> 2)


  // (5) The function ordered_thread_sizes orders the message threads according
  //     to how many answers were given for one message (that is how many
  //     children, grand-children and so on one message received).
  //
  //     The auxiliary function search enumerates a given record r followed by
  //     all its children, grand-children and so on, as a list of Recs
  //     (r itself is the first element). Children are looked up by using
  //     their num as a position in rs, so rs must be ordered such that
  //     rs(i).num == i.
  //     NOTE: the recursion does not terminate if the children links
  //     form a cycle.
  //
  //     The function thread_size generates for a message record the pair
  //     (num, size of the list returned by search); ordered_thread_sizes
  //     produces these pairs for every record, largest thread first.

  // Depth-first enumeration over an indexed copy of the records, so that
  // each child lookup is a constant-time access instead of a list walk.
  private def descendants(r: Rec, indexed: Vector[Rec]) : List[Rec] =
    r :: r.children.flatMap(c => descendants(indexed(c), indexed))

  def search(r: Rec, rs: List[Rec]) : List[Rec] =
    descendants(r, rs.toVector)

  def thread_size(r: Rec, rs: List[Rec]) : (Int, Int) =
    (r.num, search(r, rs).size)

  def ordered_thread_sizes(rs: List[Rec]) : List[(Int, Int)] = {
    // index the records once for all thread-size computations
    val indexed = rs.toVector
    // stable ascending sort followed by reverse: among threads of equal
    // size, later records come first
    rs.map(r => (r.num, descendants(r, indexed).size)).sortBy(_._2).reverse
  }


  // test cases:
  //
  //val recs_p = post_process(get_csv("log.csv").map(process_line))
  //
  //search(recs_p(459), recs_p).map(_.num)
  //  => List(459, 401, 404, 426, 428, 399, 377, 357, 325, 304)
  //
  //thread_size(recs_p(459), recs_p)
  //  => (459,10)
  //
  //ordered_thread_sizes(recs_p).take(4)
  //  => List((402,18), (95,12), (488,11), (459,10))


}
|