|      8 //    all words in a string using the regular expression |      8 //    all words in a string using the regular expression | 
|      9 //    \w+  and the library function  |      9 //    \w+  and the library function  | 
|     10 // |     10 // | 
|     11 //         some_regex.findAllIn(some_string) |     11 //         some_regex.findAllIn(some_string) | 
|     12 // |     12 // | 
|     13 //    The words should be Returned as a lsit of strings. |     13 //    The words should be Returned as a list of strings. | 
|     14  |     14  | 
|     15 def clean(s: String) : List[String] =  |     15 def clean(s: String) : List[String] =  | 
|     16   ("""\w+""".r).findAllIn(s).toList |     16   ("""\w+""".r).findAllIn(s).toList | 
|     17  |     17  | 
|     18  |     18  | 
|     19 //(2) The function occurences calculates the number of times   |     19 //(2) The function occurrences calculates the number of times   | 
|     20 //    strings occur in a list of strings. These occurences should  |     20 //    strings occur in a list of strings. These occurrences should  | 
|     21 //    be calculated as a Map from strings to integers. |     21 //    be calculated as a Map from strings to integers. | 
|     22  |     22  | 
|     23 def occurences(xs: List[String]): Map[String, Int] = |     23 def occurrences(xs: List[String]): Map[String, Int] = | 
|     24   (for (x <- xs.distinct) yield (x, xs.count(_ == x))).toMap |     24   (for (x <- xs.distinct) yield (x, xs.count(_ == x))).toMap | 
|     25  |     25  | 
|     26 //(3) This functions calculates the dot-product of two documents |     26 //(3) This functions calculates the dot-product of two documents | 
|     27 //    (list of strings). For this it calcualtes the occurence |     27 //    (list of strings). For this it calculates the occurrence | 
|     28 //    maps from (2) and then multiplies the corresponding occurences.  |     28 //    maps from (2) and then multiplies the corresponding occurrences.  | 
|     29 //    If a string does not occur in a document, the product is zero. |     29 //    If a string does not occur in a document, the product is zero. | 
|     30 //    The function finally sums up all products.  |     30 //    The function finally sums up all products.  | 
|     31  |     31  | 
|     32 def prod(lst1: List[String], lst2: List[String]) : Int = { |     32 def prod(lst1: List[String], lst2: List[String]) : Int = { | 
|     33     val words = (lst1 ::: lst2).distinct |     33     val words = (lst1 ::: lst2).distinct | 
|     34     val occs1 = occurences(lst1) |     34     val occs1 = occurrences(lst1) | 
|     35     val occs2 = occurences(lst2) |     35     val occs2 = occurrences(lst2) | 
|     36     words.map{ w => occs1.getOrElse(w, 0) * occs2.getOrElse(w, 0) }.sum |     36     words.map{ w => occs1.getOrElse(w, 0) * occs2.getOrElse(w, 0) }.sum | 
|     37 } |     37 } | 
|     38  |     38  | 
|     39 //(4) Complete the functions overlap and similarity. The overlap of |     39 //(4) Complete the functions overlap and similarity. The overlap of | 
|     40 //    two documents is calculated by the formula given in the assignment |     40 //    two documents is calculated by the formula given in the assignment | 
|     55  |     55  | 
|     56  |     56  | 
|     57 val list1 = List("a", "b", "b", "c", "d")  |     57 val list1 = List("a", "b", "b", "c", "d")  | 
|     58 val list2 = List("d", "b", "d", "b", "d") |     58 val list2 = List("d", "b", "d", "b", "d") | 
|     59  |     59  | 
|     60 occurences(List("a", "b", "b", "c", "d"))   // Map(a -> 1, b -> 2, c -> 1, d -> 1) |     60 occurrences(List("a", "b", "b", "c", "d"))   // Map(a -> 1, b -> 2, c -> 1, d -> 1) | 
|     61 occurences(List("d", "b", "d", "b", "d"))   // Map(d -> 3, b -> 2) |     61 occurrences(List("d", "b", "d", "b", "d"))   // Map(d -> 3, b -> 2) | 
|     62  |     62  | 
|     63 prod(list1,list2) // 7  |     63 prod(list1,list2) // 7  | 
|     64  |     64  | 
|     65 overlap(list1, list2)   // 0.5384615384615384 |     65 overlap(list1, list2)   // 0.5384615384615384 | 
|     66 overlap(list2, list1)   // 0.5384615384615384 |     66 overlap(list2, list1)   // 0.5384615384615384 |