// Preliminary Part about Code Similarity
// Core Part 2 about Code Similarity
//========================================
//===================================

/** Word-count based similarity between two texts.
  *
  * A text is split into words (`clean`), the words are counted
  * (`occurrences`), the counts of two texts are multiplied word by word and
  * summed (`prod`), and that sum is divided by the larger of the two
  * self-products (`overlap`, `similarity`).
  */
object C2 {

  // ADD YOUR CODE BELOW
  //======================

  /** (1) Finds all words in a string.
    *
    * A word is a match of the regular expression `\w+`; the matches are
    * collected with the library function `some_regex.findAllIn(some_string)`.
    *
    * @param s the text to split into words
    * @return the words of `s` as a list of strings, in order of appearance
    */
  def clean(s: String): List[String] =
    """\w+""".r.findAllIn(s).toList

  /** (2) Calculates the number of times strings occur in a list of strings.
    *
    * @param xs the strings to count
    * @return a map from each distinct string in `xs` to its number of
    *         occurrences; strings that are not in `xs` have no entry
    */
  def occurrences(xs: List[String]): Map[String, Int] =
    // Grouping walks the list once instead of re-counting the whole list
    // for every distinct string.
    xs.groupBy(identity).map { case (word, group) => word -> group.size }

  /** (3) Calculates the dot-product of two documents (lists of strings).
    *
    * It builds the occurrence maps from (2) and multiplies the corresponding
    * occurrences. If a string does not occur in a document, the product is
    * zero. The function finally sums up all products.
    *
    * @param lst1 the words of the first document
    * @param lst2 the words of the second document
    * @return the sum over all words of (count in `lst1`) * (count in `lst2`)
    */
  def prod(lst1: List[String], lst2: List[String]): Int = {
    val counts1 = occurrences(lst1)
    val counts2 = occurrences(lst2)
    // A word missing from either document contributes 0, so visiting the
    // words of lst1 alone is enough.
    counts1.iterator.map { case (word, n) => n * counts2.getOrElse(word, 0) }.sum
  }

  /** (4) Calculates the overlap of two documents:
    * `prod(lst1, lst2) / max(prod(lst1, lst1), prod(lst2, lst2))`.
    *
    * When both lists are empty the denominator is 0 and the result is NaN.
    *
    * @param lst1 the words of the first document
    * @param lst2 the words of the second document
    * @return the overlap as a `Double`
    */
  def overlap(lst1: List[String], lst2: List[String]): Double = {
    val norm = math.max(prod(lst1, lst1), prod(lst2, lst2))
    // Convert before dividing: Int / Int would truncate the ratio to a
    // whole number and throw on a zero denominator.
    prod(lst1, lst2).toDouble / norm
  }

  /** (4) Calculates the similarity of two strings, which is the overlap of
    * the cleaned (see (1)) strings.
    *
    * @param s1 the first text
    * @param s2 the second text
    * @return `overlap(clean(s1), clean(s2))`
    */
  def similarity(s1: String, s2: String): Double =
    overlap(clean(s1), clean(s2))


/* Test cases


val list1 = List("a", "b", "b", "c", "d")
val list2 = List("d", "b", "d", "b", "d")

[... lines not present in this excerpt ...]
Australia. Australia has a comparative advantage in the highly
competitive tourism industry due to its rich and varied natural
heritage which ensures Australia's capacity to attract international
ecotourists."""

similarity(orig1, plag1)     // 0.8679245283018868


// Plagiarism examples from
// https://www.utc.edu/library/help/tutorials/plagiarism/examples-of-plagiarism.php

[... lines not present in this excerpt ...]
usually broken into short-term (acute) and long-term (chronic)
effects. Both of these types of harm must be addressed in ecosystem
recovery: a controversial tactic that is often implemented immediately
following an oil spill."""

overlap(clean(orig2), clean(plag2))  // 0.728
similarity(orig2, plag2)             // 0.728



// The punchline: everything above 0.6 looks suspicious and
// should be investigated by staff.

*/

}