// all words in a string using the regular expression
// \w+ and the library function
//
// some_regex.findAllIn(some_string)
//
// The words should be returned as a list of strings.
14 |
14 |
// Extracts every maximal run of word characters (regex \w+) from `s`,
// in order of appearance. Returns Nil when `s` contains no word characters.
def clean(s: String) : List[String] =
  """\w+""".r.findAllIn(s).toList
17 |
17 |
18 |
18 |
//(2) The function occurrences calculates the number of times
// strings occur in a list of strings. These occurrences should
// be calculated as a Map from strings to integers.
//
// Every distinct string of xs becomes a key; its value is the number
// of times it appears in xs. An empty list yields an empty map.

def occurrences(xs: List[String]): Map[String, Int] =
  // group equal strings once instead of re-counting the list per word
  xs.groupBy(identity).map { case (word, group) => word -> group.size }

// Alias under the earlier (misspelled) name, kept so existing callers
// continue to compile.
def occurences(xs: List[String]): Map[String, Int] = occurrences(xs)
25 |
25 |
//(3) This function calculates the dot-product of two documents
// (lists of strings). For this it calculates the occurrence
// maps from (2) and then multiplies the corresponding occurrences.
// If a string does not occur in a document, the product is zero.
// The function finally sums up all products.

def prod(lst1: List[String], lst2: List[String]) : Int = {
  val occs1 = occurrences(lst1)
  val occs2 = occurrences(lst2)
  // Only words present in lst1 can contribute a non-zero product, so it
  // is enough to walk the first map; a word absent from lst2 counts as 0.
  occs1.iterator.map { case (w, n) => n * occs2.getOrElse(w, 0) }.sum
}
38 |
38 |
//(4) Complete the functions overlap and similarity. The overlap of
// two documents is calculated by the formula given in the assignment
55 |
55 |
56 |
56 |
// Sample documents and example calls; the trailing comments show the
// expected results.
val list1 = List("a", "b", "b", "c", "d")
val list2 = List("d", "b", "d", "b", "d")

occurrences(List("a", "b", "b", "c", "d"))   // Map(a -> 1, b -> 2, c -> 1, d -> 1)
occurrences(List("d", "b", "d", "b", "d"))   // Map(d -> 3, b -> 2)

prod(list1,list2)                            // 7

overlap(list1, list2)                        // 0.5384615384615384
overlap(list2, list1)                        // 0.5384615384615384