336
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
|
1 |
// Resit Exam about data-mining a chat log
|
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
|
2 |
//=========================================
|
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
|
3 |
//
|
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
|
4 |
// This
|
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
|
5 |
//
|
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
|
6 |
|
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
|
7 |
|
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
|
8 |
object Resit {

  import scala.io.Source
  import scala.util._


  // (1) The function below takes a file name as argument. It reads
  //     the corresponding file and returns its content. The content
  //     is returned as a list of strings, one string for each line
  //     in the file. Since the file is a csv-file, the first line
  //     (the header) is dropped. Lines are separated by "\n".
  //
  //     The file is decoded as ISO-8859-1 and is closed again before
  //     the function returns, even if reading fails.

  def get_csv(name: String) : List[String] =
    Using.resource(Source.fromFile(name)("ISO-8859-1")) { src =>
      // split("\n") (rather than getLines) keeps the original treatment
      // of trailing blank lines, which are discarded by split
      src.mkString.split("\n").toList.drop(1)
    }

  // test cases:
  //
  // get_csv("log.csv")
  // get_csv("log.csv").length     // should be 680


  // (2) The function below takes a single line from the csv-file (as generated by
  //     get_csv) and creates a Rec(ord) data structure. The data from the csv-file
  //     is copied as follows:
  //
  //     csv-file         Rec data structure
  //     -----------------------------------
  //     counter       => num
  //     id            => msg_id
  //     time_date     => date
  //     name          => author
  //     country       => country  (None if no country is given)
  //     parent_id     => reply_id (None if there is no parent)
  //     msg           => msg
  //                   => parent is set to None (will be calculated later)
  //                   => children is set to Nil (will be calculated later)
  //
  //     BE CAREFUL: the message text in the last field can contain commas.
  //     Therefore the line is split into at most 7 pieces, such that
  //     everything from the 7th field onwards ends up, commas included,
  //     in the field msg.

  final case class Rec(num: Int,
                       msg_id: String,
                       date: String,
                       msg: String,
                       author: String,
                       country: Option[String],
                       reply_id : Option[String],
                       parent: Option[Int] = None,
                       children: List[Int] = Nil)


  // Throws NumberFormatException if the counter is not an integer and
  // an IndexOutOfBoundsException if the line has fewer than 7 fields.
  def process_line(line: String) : Rec = {
    // empty csv fields are represented as None
    def non_empty(s: String) : Option[String] = Some(s).filter(_.nonEmpty)

    // A positive limit does two things: the 7th piece keeps all remaining
    // commas of the message, and trailing empty fields are NOT discarded
    // (a plain split(",") would drop them and make e.g. "1,h,d,p,,," fail).
    val fields = line.split(",", 7)

    Rec(num = fields(0).toInt,
        msg_id = fields(1),
        date = fields(2),
        author = fields(3),
        country = non_empty(fields(4)),
        reply_id = non_empty(fields(5)),
        msg = fields(6))
  }


  // test cases:
  //
  // process_line("""0,5ebeb459ac278d01301f1497,2020-05-15T15:25:13.413000,participant34,United Kingdom,5ebea6424923321d63155796,this question please?""")
  //
  //     ==> Rec(0,
  //             "5ebeb459ac278d01301f1497",
  //             "2020-05-15T15:25:13.413000",
  //             "this question please?",
  //             "participant34",
  //             Some("United Kingdom"),
  //             Some("5ebea6424923321d63155796"),
  //             None,
  //             List())
  //
  // process_line("""1,hash,date,p43,,,foo, bar""")
  //
  //     ==> Rec(1, "hash", "date", "foo, bar",
  //             "p43", None, None, None, List())
  //
  // (Note that in the last test case the message needs to be "foo, bar")



  // (3) Each record in the log contains a unique hash code
  //     identifying each message. Some messages also contain a hash
  //     code identifying the parent message (to which question they reply).
  //     The function post_process fills in the information about
  //     potential children and a parent message.
  //
  //     The auxiliary function get_children takes a record e and a
  //     record list rs as arguments, and returns the list of all direct
  //     children (which have the hash code of e as reply_id). The
  //     children are returned as a list of nums, in the order in which
  //     they occur in rs.
  //
  //     The auxiliary function get_parent returns the number of the
  //     first record in rs whose hash code is the reply_id of e (if
  //     there exists one, otherwise it returns None).
  //
  //     In both functions records equal to e itself are ignored.

  def get_children(e: Rec, rs: List[Rec]) : List[Int] =
    rs.collect { case r if r != e && r.reply_id.contains(e.msg_id) => r.num }

  def get_parent(e: Rec, rs: List[Rec]) : Option[Int] =
    for {
      id <- e.reply_id
      p  <- rs.find(r => r != e && r.msg_id == id)
    } yield p.num

  def post_process(rs: List[Rec]) : List[Rec] =
    rs.map(r => r.copy(parent = get_parent(r, rs),
                       children = get_children(r, rs)))


  // test cases:
  //
  //val recs = get_csv("log.csv").map(process_line)
  //
  //post_process(recs)(4).children   // List(12)
  //post_process(recs)(23).children  // List(16,26)
  //
  //post_process(recs)(8).parent     // None
  //post_process(recs)(9).parent     // Some(7)
  //post_process(recs)(16).parent    // Some(23)
  //post_process(recs)(26).parent    // Some(23)


  // (4) The next two functions calculate the countries where
  //     message authors are coming from and how many authors
  //     come from each country (returned as a Map from countries
  //     to numbers). In case an author did not specify a country,
  //     the empty string is used as the country.

  def get_countries(rs: List[Rec]) : Set[String] =
    rs.map(_.country.getOrElse("")).toSet

  def get_countries_numbers(rs: List[Rec]) : Map[String, Int] = {
    // an author with many messages must only be counted once per country
    val name_countries = rs.map(r => (r.author, r.country.getOrElse(""))).distinct
    name_countries.groupMapReduce(_._2)(_ => 1)(_ + _)
  }

  // test cases:
  //
  //val recs = get_csv("log.csv").map(process_line)
  //
  // get_countries(recs) =>
  //
  //    Set("", Poland, Lebanon, Trinidad and
  //    Tobago, Japan, Spain, Nigeria, Peru, India, Lithuania, Hong
  //    Kong, Italy, Ireland, Uganda, England, Bangladesh, China,
  //    Romania, Slovakia, United Kingdom, Norway, Pakistan, Indonesia,
  //    Malaysia, Turkey, Portugal, Hungary)
  //
  // get_countries_numbers(recs) =>
  //
  //    Map("" -> 4, Poland -> 2, Lebanon -> 1, Trinidad and Tobago -> 2,
  //        Japan -> 1, Spain -> 1, Nigeria -> 1, Peru -> 1, India -> 6,
  //        Lithuania -> 1, Hong Kong -> 3, Italy -> 1, Ireland -> 2, Uganda -> 1,
  //        England -> 4, Bangladesh -> 2, China -> 4, Romania -> 3,
  //        Slovakia -> 1, United Kingdom -> 34, Norway -> 1, Pakistan -> 2,
  //        Indonesia -> 2, Malaysia -> 1, Turkey -> 1, Portugal -> 1,
  //        Hungary -> 2)


  // (5) The function ordered_thread_sizes orders the message threads according
  //     to how many answers were given for one message (that is how many
  //     children, grand-children and so on one message received).
  //
  //     The auxiliary function search enumerates a given record r followed
  //     by all its children, grand-children and so on. The result is a list
  //     of Recs which starts with r itself. Children are looked up in rs by
  //     their num (not by their position in rs); a child num for which rs
  //     contains no record is skipped. The records are expected to be
  //     post-processed; the recursion does not guard against cyclic
  //     children lists.
  //
  //     The function thread_size generates for a message record a pair
  //     (num, size), where size is the length of the list returned by
  //     search (the record itself is included in the count).
  //     ordered_thread_sizes produces such a pair for every record in rs,
  //     largest thread first.

  // lookup table from num to record, built once per traversal
  private def index_by_num(rs: List[Rec]) : Map[Int, Rec] =
    rs.map(r => r.num -> r).toMap

  // r followed (depth-first, children in listed order) by all its descendants
  private def collect_thread(r: Rec, by_num: Map[Int, Rec]) : List[Rec] =
    r :: r.children.flatMap(c => by_num.get(c).toList.flatMap(collect_thread(_, by_num)))

  def search(r: Rec, rs: List[Rec]) : List[Rec] =
    collect_thread(r, index_by_num(rs))

  def thread_size(r: Rec, rs: List[Rec]) : (Int, Int) =
    (r.num, search(r, rs).size)

  def ordered_thread_sizes(rs: List[Rec]) : List[(Int, Int)] = {
    val by_num = index_by_num(rs)
    // sortBy + reverse (instead of sorting by the negated size) keeps the
    // original order among threads of equal size
    rs.map(r => (r.num, collect_thread(r, by_num).size)).sortBy(_._2).reverse
  }


  // test cases:
  //
  //val recs_p = post_process(get_csv("log.csv").map(process_line))
  //
  //search(recs_p(459), recs_p).map(_.num)
  //   => List(459, 401, 404, 426, 428, 399, 377, 357, 325, 304)
  //
  //thread_size(recs_p(459), recs_p)
  //   => (459,10)
  //
  //ordered_thread_sizes(recs_p).take(4)
  //   => List((402,18), (95,12), (488,11), (459,10))


}
|