1 // Preliminary Part about Code Similarity |
1 // Preliminary Part about Code Similarity |
2 //======================================== |
2 //======================================== |
3 |
3 |
4 |
4 object CW7a { |
5 object CW7a { |
|
6 |
5 |
7 |
6 |
8 //(1) Complete the clean function below. It should find |
7 //(1) Complete the clean function below. It should find |
9 // all words in a string using the regular expression |
8 // all words in a string using the regular expression |
10 // \w+ and the library function |
9 // \w+ and the library function |
11 // |
10 // |
12 // some_regex.findAllIn(some_string) |
11 // some_regex.findAllIn(some_string) |
13 // |
12 // |
14 // The words should be returned as a list of strings. |
13 // The words should be returned as a list of strings. |
15 |
14 |
16 |
15 |
/** Extracts every word from `s`.
  *
  * A word is a maximal run of characters matched by the regular expression
  * `\w+`. Everything else (spaces, line breaks, punctuation, hyphens) acts as
  * a separator and is dropped, so "eco-tourism" yields "eco" and "tourism".
  *
  * @param s the text to tokenise; may be empty or span several lines
  * @return the words in order of appearance; empty if `s` contains no word
  */
def clean(s: String) : List[String] =
  // Scan the whole string in one go instead of pre-splitting on single spaces:
  // pre-splitting produced empty tokens for consecutive spaces and needed a
  // fragile character-by-character comparison to undo the punctuation removal.
  """\w+""".r.findAllIn(s).toList
|
23 |
19 |
/** Recovers the pieces of a word that were separated by removed characters.
  *
  * `returned` is expected to be `original` with some characters deleted (for
  * example everything a regular expression did not match). The two strings
  * are compared position by position starting at index `i`; each position
  * where they disagree marks a deleted character in `original` and therefore
  * a boundary between two pieces of `returned`.
  *
  * @param original the word before any characters were removed
  * @param returned the word after the removal
  * @param i        index at which to start comparing; the prefixes before `i`
  *                 are assumed to be identical
  * @return the non-empty pieces of `returned`, in order; empty if nothing is
  *         left after the removal
  */
def divide_string_where_different(original: String, returned: String, i : Int): List[String] =
  if (i >= original.length || i >= returned.length) {
    // Ran off the end of either string: no further boundary can be found, so
    // whatever `returned` holds is the last piece. This also covers empty
    // input and an `original` that ends in removed characters.
    List(returned).filter(_.nonEmpty)
  } else if (original(i) != returned(i)) {
    // original(i) was removed: cut here, skip that character in `original`
    // and carry on with the remainders of both strings.
    val piecesAfterCut =
      divide_string_where_different(original.substring(i + 1), returned.substring(i), 0)
    // The piece before the cut is empty when `original` starts with a removed
    // character; such empty pieces are not words and are dropped.
    (returned.substring(0, i) :: piecesAfterCut).filter(_.nonEmpty)
  } else {
    divide_string_where_different(original, returned, i + 1)
  }
|
36 |
|
37 //(2) The function occurrences calculates the number of times |
|
38 // strings occur in a list of strings. These occurrences should |
|
39 // be calculated as a Map from strings to integers. |
22 // be calculated as a Map from strings to integers. |
40 |
23 |
41 |
24 |
/** Counts how often each string occurs in `xs`.
  *
  * @param xs the strings to count; may be empty
  * @return a map from every distinct string in `xs` to its number of
  *         occurrences; strings that do not occur have no entry
  */
def occurrences(xs: List[String]): Map[String, Int] =
  // One pass over the list: group equal strings, map each to 1, add them up.
  // This avoids re-scanning the whole list once per distinct word.
  xs.groupMapReduce(identity)(_ => 1)(_ + _)
|
47 |
|
48 |
|
49 |
28 |
50 //(3) This function calculates the dot-product of two documents |
29 //(3) This function calculates the dot-product of two documents |
51 // (list of strings). For this it calculates the occurrence |
30 // (list of strings). For this it calculates the occurrence |
52 // maps from (2) and then multiplies the corresponding occurrences. |
31 // maps from (2) and then multiplies the corresponding occurrences. |
53 // If a string does not occur in a document, the product is zero. |
32 // If a string does not occur in a document, the product is zero. |
54 // The function finally sums up all products. |
33 // The function finally sums up all products. |
55 |
34 |
56 |
35 |
/** Dot-product of two documents given as lists of strings.
  *
  * Both documents are turned into occurrence maps via `occurrences`. For every
  * string of the first document its count is multiplied by the count of the
  * same string in the second document (0 if it does not occur there), and the
  * products are summed.
  *
  * @param lst1 words of the first document
  * @param lst2 words of the second document
  * @return the sum of the pairwise count products; 0 if `lst1` is empty
  */
def prod(lst1: List[String], lst2: List[String]) : Int = {
  // Build each occurrence map exactly once, not once per looked-up word.
  val counts1 = occurrences(lst1)
  val counts2 = occurrences(lst2)
  // `sum` (unlike `reduce`) is defined for an empty document and yields 0.
  counts1.iterator.map { case (word, n) => n * counts2.getOrElse(word, 0) }.sum
}
|
67 |
|
68 |
39 |
69 //(4) Complete the functions overlap and similarity. The overlap of |
40 //(4) Complete the functions overlap and similarity. The overlap of |
70 // two documents is calculated by the formula given in the assignment |
41 // two documents is calculated by the formula given in the assignment |
71 // description. The similarity of two strings is given by the overlap |
42 // description. The similarity of two strings is given by the overlap |
72 // of the cleaned strings (see (1)). |
43 // of the cleaned strings (see (1)). |
73 |
44 |
74 |
45 |
75 //def overlap(lst1: List[String], lst2: List[String]) : Double = ... |
46 //def overlap(lst1: List[String], lst2: List[String]) : Double = ... |
76 |
/** Overlap of two documents.
  *
  * Computed as `prod(lst1, lst2)` divided by the larger of `prod(lst1, lst1)`
  * and `prod(lst2, lst2)`. There is no guard against a zero denominator.
  *
  * @param lst1 words of the first document
  * @param lst2 words of the second document
  * @return the quotient described above as a Double
  */
def overlap(lst1: List[String], lst2: List[String]) : Double = {
  val shared = prod(lst1, lst2)
  val selfFirst = prod(lst1, lst1)
  val selfSecond = prod(lst2, lst2)
  // Int -> Double conversion is exact, so taking the max before converting
  // gives the same denominator as converting both operands first.
  shared.toDouble / math.max(selfFirst, selfSecond).toDouble
}
77 //def similarity(s1: String, s2: String) : Double = ... |
49 //def similarity(s1: String, s2: String) : Double = ... |
78 |
/** Similarity of two texts: the overlap of their cleaned word lists.
  *
  * @param s1 first text
  * @param s2 second text
  * @return `overlap` applied to `clean(s1)` and `clean(s2)`
  */
def similarity(s1: String, s2: String) : Double = {
  val words1 = clean(s1)
  val words2 = clean(s2)
  overlap(words1, words2)
}
80 |
52 |
81 |
53 |
82 /* Test cases |
54 /* Test cases |
83 |
55 import CW7a._ |
84 |
56 val list1 = List("a", "b", "b", "c", "d") |
85 val list1 = List("a", "b", "b", "c", "d") |
|
86 val list2 = List("d", "b", "d", "b", "d") |
57 val list2 = List("d", "b", "d", "b", "d") |
87 |
58 occurrences(List("a", "b", "b", "c", "d")) |
88 occurrences(List("a", "b", "b", "c", "d")) // Map(a -> 1, b -> 2, c -> 1, d -> 1) |
59 occurrences(List("d", "b", "d", "b", "d")) |
89 occurrences(List("d", "b", "d", "b", "d")) // Map(d -> 3, b -> 2) |
60 prod(list1,list2) // 7 |
90 |
|
91 prod(list1,list2) // 7 |
|
92 prod(list1,list1) |
|
93 prod(list2,list2) |
|
94 |
|
95 overlap(list1, list2) // 0.5384615384615384 |
61 overlap(list1, list2) // 0.5384615384615384 |
96 overlap(list2, list1) // 0.5384615384615384 |
62 overlap(list2, list1) // 0.5384615384615384 |
97 overlap(list1, list1) // 1.0 |
63 overlap(list1, list1) // 1.0 |
98 overlap(list2, list2) // 1.0 |
64 overlap(list2, list2) // 1.0 |
99 |
65 // Plagiarism examples from |
100 // Plagiarism examples from |
|
101 // https://desales.libguides.com/avoidingplagiarism/examples |
66 // https://desales.libguides.com/avoidingplagiarism/examples |
102 |
|
103 val orig1 = """There is a strong market demand for eco-tourism in |
67 val orig1 = """There is a strong market demand for eco-tourism in |
104 Australia. Its rich and diverse natural heritage ensures Australia's |
68 Australia. Its rich and diverse natural heritage ensures Australia's |
105 capacity to attract international ecotourists and gives Australia a |
69 capacity to attract international ecotourists and gives Australia a |
106 comparative advantage in the highly competitive tourism industry.""" |
70 comparative advantage in the highly competitive tourism industry.""" |
107 |
|
108 val plag1 = """There is a high market demand for eco-tourism in |
71 val plag1 = """There is a high market demand for eco-tourism in |
109 Australia. Australia has a comparative advantage in the highly |
72 Australia. Australia has a comparative advantage in the highly |
110 competitive tourism industry due to its rich and varied natural |
73 competitive tourism industry due to its rich and varied natural |
111 heritage which ensures Australia's capacity to attract international |
74 heritage which ensures Australia's capacity to attract international |
112 ecotourists.""" |
75 ecotourists.""" |
113 |
|
114 similarity(orig1, plag1) // 0.8679245283018868 |
76 similarity(orig1, plag1) // 0.8679245283018868 |
115 |
77 // Plagiarism examples from |
116 |
|
117 // Plagiarism examples from |
|
118 // https://www.utc.edu/library/help/tutorials/plagiarism/examples-of-plagiarism.php |
78 // https://www.utc.edu/library/help/tutorials/plagiarism/examples-of-plagiarism.php |
119 |
|
120 val orig2 = """No oil spill is entirely benign. Depending on timing and |
79 val orig2 = """No oil spill is entirely benign. Depending on timing and |
121 location, even a relatively minor spill can cause significant harm to |
80 location, even a relatively minor spill can cause significant harm to |
122 individual organisms and entire populations. Oil spills can cause |
81 individual organisms and entire populations. Oil spills can cause |
123 impacts over a range of time scales, from days to years, or even |
82 impacts over a range of time scales, from days to years, or even |
124 decades for certain spills. Impacts are typically divided into acute |
83 decades for certain spills. Impacts are typically divided into acute |
125 (short-term) and chronic (long-term) effects. Both types are part of a |
84 (short-term) and chronic (long-term) effects. Both types are part of a |
126 complicated and often controversial equation that is addressed after |
85 complicated and often controversial equation that is addressed after |
127 an oil spill: ecosystem recovery.""" |
86 an oil spill: ecosystem recovery.""" |
128 |
|
129 val plag2 = """There is no such thing as a "good" oil spill. If the |
87 val plag2 = """There is no such thing as a "good" oil spill. If the |
130 time and place are just right, even a small oil spill can cause damage |
88 time and place are just right, even a small oil spill can cause damage |
131 to sensitive ecosystems. Further, spills can cause harm days, months, |
89 to sensitive ecosystems. Further, spills can cause harm days, months, |
132 years, or even decades after they occur. Because of this, spills are |
90 years, or even decades after they occur. Because of this, spills are |
133 usually broken into short-term (acute) and long-term (chronic) |
91 usually broken into short-term (acute) and long-term (chronic) |
134 effects. Both of these types of harm must be addressed in ecosystem |
92 effects. Both of these types of harm must be addressed in ecosystem |
135 recovery: a controversial tactic that is often implemented immediately |
93 recovery: a controversial tactic that is often implemented immediately |
136 following an oil spill.""" |
94 following an oil spill.""" |
137 |
|
138 overlap(clean(orig2), clean(plag2)) // 0.728 |
95 overlap(clean(orig2), clean(plag2)) // 0.728 |
139 similarity(orig2, plag2) // 0.728 |
96 similarity(orig2, plag2) // 0.728 |
140 |
97 // The punchline: everything above 0.6 looks suspicious and |
141 |
|
142 |
|
143 // The punchline: everything above 0.6 looks suspicious and |
|
144 // should be investigated by staff. |
98 // should be investigated by staff. |
145 |
|
146 */ |
99 */ |
147 |
100 |
148 } |
101 } |
149 |
|