// Resit Exam about data-mining a chat log
//=========================================
//
// This coursework is about mining a log of an online chat between 85
// participants. The log is given as a csv-list in the file
// log.csv. The log is an unordered list containing information about
// which message has been sent, by whom, when and in response to which
// other message. Each message also has a number and a unique hash code.
//
// !! For further information about the tasks, see:          !!
// !!                                                        !!
// !!    https://nms.kcl.ac.uk/christian.urban/cw-resit.pdf  !!

object Resit {

import io.Source
import scala.util._

//=============
// (1) The function get_csv takes a file name as argument. It should read
//     the corresponding file and return its content. The content should
//     be returned as a list of strings, a string for each line in the
//     file. Since the file is a csv-file, the first line (the header)
//     should be dropped. Lines are separated by "\n".
//
//     The file is decoded as UTF-8 (assumed encoding of the log) and the
//     underlying file handle is always closed, even if reading fails.
//     An empty file yields Nil.

def get_csv(name: String) : List[String] = {
  val source = Source.fromFile(name, "UTF-8")
  try source.mkString.split("\n").toList.drop(1)
  finally source.close()
}

// test cases:
//
// get_csv("log.csv")
// get_csv("log.csv").length  // should be 680


//=============
// (2) The function below takes a single line from the csv-file (as
//     generated by get_csv) and creates a Rec(ord) data structure. The
//     data from the csv-file should be copied as follows:
//
//     csv-file         Rec data structure
//     -----------------------------------
//     counter      =>  num
//     id           =>  msg_id
//     time_date    =>  date
//     name         =>  author
//     country      =>  country   (should be None if no country is given)
//     parent_id    =>  reply_id  (should be None if there is no parent)
//     msg          =>  msg
//                  =>  parent is set to None (will be calculated later)
//                  =>  children is set to Nil (will be calculated later)
//
//     You should use the function line.split(",").toList to separate
//     the items in the csv-line. BE CAREFUL that the message text in
//     the last field can contain commas and therefore the split will not
//     always result in a list of 7 elements. You need to concatenate
//     anything beyond the 7th field into a single string for the field msg.

case class Rec(num: Int,
               msg_id: String,
               date: String,
               msg: String,
               author: String,
               country: Option[String],
               reply_id : Option[String],
               parent: Option[Int] = None,
               children: List[Int] = Nil)

// Turns an empty csv field into None and any other value into Some(value).
private def opt_field(s: String) : Option[String] =
  Option(s).filter(_.nonEmpty)

// Parses one csv line into a Rec (parent and children keep their defaults).
// Throws NumberFormatException if the counter field is not an integer.
def process_line(line: String) : Rec = {
  // split(",") silently drops trailing empty fields (e.g. "1,h,d,p,,,"
  // yields only 4 items), so the list is padded back to the 7 expected
  // fields before it is indexed.
  val fields = line.split(",").toList.padTo(7, "")
  // commas inside the message were consumed by the split; re-join
  // everything from the 7th field onwards to restore the original text
  val msg = fields.drop(6).mkString(",")
  Rec(num      = fields(0).toInt,
      msg_id   = fields(1),
      date     = fields(2),
      msg      = msg,
      author   = fields(3),
      country  = opt_field(fields(4)),
      reply_id = opt_field(fields(5)))
}


// test cases:
//
// process_line("""0,5ebeb459ac278d01301f1497,2020-05-15T15:25:13.413000,participant34,United Kingdom,5ebea6424923321d63155796,this question please?""")
//
//  ==> Rec(0,
//          "5ebeb459ac278d01301f1497",
//          "2020-05-15T15:25:13.413000",
//          "this question please?",
//          "participant34",
//          Some("United Kingdom"),
//          Some("5ebea6424923321d63155796"),
//          None,
//          List())
//
// process_line("""1,hash,date,p43,,,foo, bar""")
//
//  ==> Rec(1, "hash", "date", "foo, bar",
//          "p43", None, None, None, List())
//
// (Note that in the second test case the message needs to be "foo, bar")


//=============
// (3) Each record in the log contains a unique hash code
//     identifying each message. Some messages also contain a hash
//     code identifying the parent message (to which question they reply).
//     The function post_process fills in the information about
//     potential children and a potential parent message.
//
//     The auxiliary function get_children takes a record e and a
//     record list rs as arguments, and returns the list of all direct
//     children (which have the hash code of e as reply_id). The list
//     of children are returned as a list of nums.
//
//     The auxiliary function get_parent returns the number of the
//     record corresponding to the reply_id (if there exists one,
//     otherwise returns None).

// Nums of all records in rs that reply directly to e, in the order of rs.
def get_children(e: Rec, rs: List[Rec]) : List[Int] =
  rs.collect { case r if r.reply_id.contains(e.msg_id) => r.num }

// Num of the record e replies to; None if e has no reply_id or the
// referenced message does not occur in rs.
def get_parent(e: Rec, rs: List[Rec]) : Option[Int] =
  for {
    parent_id <- e.reply_id
    parent    <- rs.find(_.msg_id == parent_id)
  } yield parent.num

// Returns rs (same order, same length) with parent and children filled in.
def post_process(rs: List[Rec]) : List[Rec] =
  rs.map(r => r.copy(parent = get_parent(r, rs), children = get_children(r, rs)))


// test cases:
//
//val recs = get_csv("log.csv").map(process_line)
//
//post_process(recs)(4).children   // List(12)
//post_process(recs)(23).children  // List(16,26)
//
//post_process(recs)(8).parent     // None
//post_process(recs)(9).parent     // Some(7)
//post_process(recs)(16).parent    // Some(23)
//post_process(recs)(26).parent    // Some(23)


//=============
// (4) The next two functions calculate the countries where
//     message authors are coming from and how many authors
//     come from each country (returned as a Map from countries
//     to numbers). In case an author did not specify a country,
//     the empty string is returned.

// All countries occurring in rs; "" stands for "no country given".
def get_countries(rs: List[Rec]) : Set[String] =
  rs.map(_.country.getOrElse("")).toSet

// Number of distinct authors per country. An author who wrote several
// messages is counted only once, so the values add up to the number of
// participants rather than to the number of messages.
def get_countries_numbers(rs: List[Rec]) : Map[String, Int] =
  rs.groupBy(_.country.getOrElse(""))
    .map { case (country, msgs) => country -> msgs.map(_.author).distinct.size }

// test cases:
//
//val recs = get_csv("log.csv").map(process_line)
//
// get_countries(recs) =>
//
//    Set("", Poland, Lebanon, Trinidad and
//    Tobago, Japan, Spain, Nigeria, Peru, India, Lithuania, Hong
//    Kong, Italy, Ireland, Uganda, England, Bangladesh, China,
//    Romania, Slovakia, United Kingdom, Norway, Pakistan, Indonesia,
//    Malaysia, Turkey, Portugal, Hungary)
//
// get_countries_numbers(recs) =>
//
//    Map("" -> 4, Poland -> 2, Lebanon -> 1, Trinidad and Tobago -> 2,
//        Japan -> 1, Spain -> 1, Nigeria -> 1, Peru -> 1, India -> 6,
//        Lithuania -> 1, Hong Kong -> 3, Italy -> 1, Ireland -> 2, Uganda -> 1,
//        England -> 4, Bangladesh -> 2, China -> 4, Romania -> 3,
//        Slovakia -> 1, United Kingdom -> 34, Norway -> 1, Pakistan -> 2,
//        Indonesia -> 2, Malaysia -> 1, Turkey -> 1, Portugal -> 1,
//        Hungary -> 2)


//=============

// (5) The function ordered_thread_sizes orders the message threads
//     according to how many answers were given for one message (that is
//     how many children, grand-children and so on one message has).
//
//     The auxiliary function search enumerates all children,
//     grand-children and so on for a given record r (including the record
//     itself). Search returns the children and so on as a list of Recs.
//
//     The function thread_size generates for a record, say r, a pair
//     consisting of the number of r and the number of all children as
//     produced by search.
//
//     The function ordered_thread_sizes then orders the list of pairs
//     according to which thread in the chat is the longest.

// Depth-first (pre-order) enumeration of r and all its descendants.
// rs must already be post-processed, because the children lists are
// followed. Child nums that do not occur in rs are skipped. Assumes the
// reply structure is acyclic (a message cannot be its own ancestor).
def search(r: Rec, rs: List[Rec]) : List[Rec] =
  r :: r.children
        .flatMap(child_num => rs.find(_.num == child_num))
        .flatMap(child => search(child, rs))

// Pair of r's num and the size of its thread (r itself included).
def thread_size(r: Rec, rs: List[Rec]) : (Int, Int) =
  (r.num, search(r, rs).length)

// Thread sizes of all records, longest thread first. The sort is stable,
// so threads of equal size keep their relative order from rs.
def ordered_thread_sizes(rs: List[Rec]) : List[(Int, Int)] =
  rs.map(r => thread_size(r, rs)).sortBy { case (_, size) => -size }


// test cases:
//
//val recs_p = post_process(get_csv("log.csv").map(process_line))
//
//search(recs_p(459), recs_p).map(_.num)
//   => List(459, 401, 404, 426, 428, 399, 377, 357, 325, 304)
//
//thread_size(recs_p(459), recs_p)
//   => (459,10)
//
//ordered_thread_sizes(recs_p).take(4)
//   => List((402,18), (95,12), (488,11), (459,10))


}
|