1 // Preliminary Part about Code Similarity |
1 // Preliminary Part about Code Similarity |
2 //======================================== |
2 //======================================== |
3 |
3 |
4 |
4 |
5 object CW7a { // for purposes of generating a jar |
5 object CW7a { |
|
6 |
6 |
7 |
7 //(1) Complete the clean function below. It should find |
8 //(1) Complete the clean function below. It should find |
8 // all words in a string using the regular expression |
9 // all words in a string using the regular expression |
9 // \w+ and the library function |
10 // \w+ and the library function |
10 // |
11 // |
11 // some_regex.findAllIn(some_string) |
12 // some_regex.findAllIn(some_string) |
12 // |
13 // |
13 // The words should be Returned as a list of strings. |
14 // The words should be Returned as a list of strings. |
14 |
15 |
15 def clean(s: String) : List[String] = |
|
16 ("""\w+""".r).findAllIn(s).toList |
|
17 |
16 |
|
17 def clean(s: String) : List[String] = { |
|
18 val regex = """\w+""".r; |
|
19 val list_of_words = s.split(" ").toList |
|
20 for(word <- list_of_words; |
|
21 actual_word <- divide_string_where_different(word, regex.findAllIn(word).mkString, 0)) yield actual_word |
|
22 } |
|
23 |
|
24 /* |
|
25 A secondary function that takes as parameters @param original which is the original word, @param returned which is thea word after the process of removing |
|
26 some characters not allowed by a regular expression, and @param i which is the index where to start compare the characters of the two words. |
|
27 It @return a List of strings which represents all the substrings of returned which were previously divided by characters not allowed by the regular expression applied on it. |
|
28 */ |
|
29 def divide_string_where_different(original: String, returned: String, i : Int): List[String] ={ |
|
30 val max_i = original.length -1 |
|
31 if(original(i) != returned(i)) returned.substring(0, i)::divide_string_where_different(original.substring(i+1), returned.substring(i), 0).filter(_.nonEmpty) |
|
32 else if (i == max_i) List(returned) |
|
33 else divide_string_where_different(original,returned, i +1) |
|
34 |
|
35 } |
18 |
36 |
19 //(2) The function occurrences calculates the number of times |
37 //(2) The function occurrences calculates the number of times |
20 // strings occur in a list of strings. These occurrences should |
38 // strings occur in a list of strings. These occurrences should |
21 // be calculated as a Map from strings to integers. |
39 // be calculated as a Map from strings to integers. |
22 |
40 |
23 def occurrences(xs: List[String]): Map[String, Int] = |
41 |
24 (for (x <- xs.distinct) yield (x, xs.count(_ == x))).toMap |
42 def occurrences(xs: List[String]): Map[String, Int] = { |
|
43 val lst = xs.distinct |
|
44 val word_pairs = (for (word <- lst) yield (word, xs.count(_==word))).toList |
|
45 word_pairs.toMap |
|
46 } |
|
47 |
|
48 |
25 |
49 |
26 //(3) This functions calculates the dot-product of two documents |
50 //(3) This functions calculates the dot-product of two documents |
27 // (list of strings). For this it calculates the occurrence |
51 // (list of strings). For this it calculates the occurrence |
28 // maps from (2) and then multiplies the corresponding occurrences. |
52 // maps from (2) and then multiplies the corresponding occurrences. |
29 // If a string does not occur in a document, the product is zero. |
53 // If a string does not occur in a document, the product is zero. |
30 // The function finally sums up all products. |
54 // The function finally sums up all products. |
31 |
55 |
|
56 |
32 def prod(lst1: List[String], lst2: List[String]) : Int = { |
57 def prod(lst1: List[String], lst2: List[String]) : Int = { |
33 val words = (lst1 ::: lst2).distinct |
58 val map1 = occurrences(lst1) |
34 val occs1 = occurrences(lst1) |
59 val map2 = occurrences(lst2) |
35 val occs2 = occurrences(lst2) |
60 print(s"map1 is $map1 \n and map2 is $map2") |
36 words.map{ w => occs1.getOrElse(w, 0) * occs2.getOrElse(w, 0) }.sum |
61 val pairs = (for(pair1 <- map1 if(map2.get(pair1._1) != None)) yield (pair1._2, map2.get(pair1._1).get)).toList |
|
62 print(s"\n pairs are $pairs") |
|
63 val products = (for(pair <- pairs) yield pair._1 * pair._2).toList |
|
64 products.sum |
|
65 |
37 } |
66 } |
|
67 |
38 |
68 |
39 //(4) Complete the functions overlap and similarity. The overlap of |
69 //(4) Complete the functions overlap and similarity. The overlap of |
40 // two documents is calculated by the formula given in the assignment |
70 // two documents is calculated by the formula given in the assignment |
41 // description. The similarity of two strings is given by the overlap |
71 // description. The similarity of two strings is given by the overlap |
42 // of the cleaned (see (1)) strings. |
72 // of the cleaned strings (see (1)). |
43 |
|
44 def overlap(lst1: List[String], lst2: List[String]) : Double = { |
|
45 val m1 = prod(lst1, lst1) |
|
46 val m2 = prod(lst2, lst2) |
|
47 prod(lst1, lst2).toDouble / (List(m1, m2).max) |
|
48 } |
|
49 |
|
50 def similarity(s1: String, s2: String) : Double = |
|
51 overlap(clean(s1), clean(s2)) |
|
52 |
73 |
53 |
74 |
54 /* |
75 //def overlap(lst1: List[String], lst2: List[String]) : Double = ... |
|
76 |
|
77 //def similarity(s1: String, s2: String) : Double = ... |
|
78 |
|
79 |
|
80 |
|
81 |
|
82 /* Test cases |
55 |
83 |
56 |
84 |
57 val list1 = List("a", "b", "b", "c", "d") |
85 val list1 = List("a", "b", "b", "c", "d") |
58 val list2 = List("d", "b", "d", "b", "d") |
86 val list2 = List("d", "b", "d", "b", "d") |
59 |
87 |
60 occurrences(List("a", "b", "b", "c", "d")) // Map(a -> 1, b -> 2, c -> 1, d -> 1) |
88 occurrences(List("a", "b", "b", "c", "d")) // Map(a -> 1, b -> 2, c -> 1, d -> 1) |
61 occurrences(List("d", "b", "d", "b", "d")) // Map(d -> 3, b -> 2) |
89 occurrences(List("d", "b", "d", "b", "d")) // Map(d -> 3, b -> 2) |
62 |
90 |
63 prod(list1,list2) // 7 |
91 prod(list1,list2) // 7 |
|
92 prod(list1,list1) |
|
93 prod(list2,list2) |
64 |
94 |
65 overlap(list1, list2) // 0.5384615384615384 |
95 overlap(list1, list2) // 0.5384615384615384 |
66 overlap(list2, list1) // 0.5384615384615384 |
96 overlap(list2, list1) // 0.5384615384615384 |
67 overlap(list1, list1) // 1.0 |
97 overlap(list1, list1) // 1.0 |
68 overlap(list2, list2) // 1.0 |
98 overlap(list2, list2) // 1.0 |