|      1 // Preliminary Part about Code Similarity |      1 // Core Part 2 about Code Similarity | 
|      2 //======================================== |      2 //=================================== | 
|      3  |      3  | 
|      4  |      4  | 
|      5 object C2 {  |      5 object C2 {  | 
|      6  |      6  | 
|      7 //(1) Complete the clean function below. It should find |      7 // ADD YOUR CODE BELOW | 
|      8 //    all words in a string using the regular expression |      8 //====================== | 
|      9 //    \w+  and the library function  |         | 
|     10 // |         | 
|     11 //         some_regex.findAllIn(some_string) |         | 
|     12 // |         | 
|     13 //    The words should be returned as a list of strings. |         | 
|     14  |      9  | 
|     15 def clean(s: String) : List[String] =  |     10 //(1) | 
|     16   ("""\w+""".r).findAllIn(s).toList |     11 def clean(s: String) : List[String] = """(\w+)""".r.findAllIn(s).toList | 
|         |     12    | 
|     17  |     13  | 
|     18  |     14  | 
|     19 //(2) The function occurrences calculates the number of times   |     15 //(2) | 
|     20 //    strings occur in a list of strings. These occurrences should  |     16 def occurrences(xs: List[String]): Map[String, Int] = { | 
|     21 //    be calculated as a Map from strings to integers. |     17     val ls = xs.distinct | 
|     22  |     18     val occLs = for (s <- ls) yield (s, xs.count(_.equals(s))) | 
|     23 def occurrences(xs: List[String]): Map[String, Int] = |     19     occLs.toMap | 
|     24   (for (x <- xs.distinct) yield (x, xs.count(_ == x))).toMap |         | 
|     25  |         | 
|     26 //(3) This function calculates the dot-product of two documents |         | 
|     27 //    (list of strings). For this it calculates the occurrence |         | 
|     28 //    maps from (2) and then multiplies the corresponding occurrences.  |         | 
|     29 //    If a string does not occur in a document, the product is zero. |         | 
|     30 //    The function finally sums up all products.  |         | 
|     31  |         | 
|     32 def prod(lst1: List[String], lst2: List[String]) : Int = { |         | 
|     33     val words = (lst1 ::: lst2).distinct |         | 
|     34     val occs1 = occurrences(lst1) |         | 
|     35     val occs2 = occurrences(lst2) |         | 
|     36     words.map{ w => occs1.getOrElse(w, 0) * occs2.getOrElse(w, 0) }.sum |         | 
|     37 }           |         | 
|     38  |         | 
|     39 //(4) Complete the functions overlap and similarity. The overlap of |         | 
|     40 //    two documents is calculated by the formula given in the assignment |         | 
|     41 //    description. The similarity of two strings is given by the overlap |         | 
|     42 //    of the cleaned (see (1)) strings.   |         | 
|     43  |         | 
|     44 def overlap(lst1: List[String], lst2: List[String]) : Double = { |         | 
|     45     val m1 = prod(lst1, lst1) |         | 
|     46     val m2 = prod(lst2, lst2)  |         | 
|     47     prod(lst1, lst2).toDouble / (List(m1, m2).max) |         | 
|     48 } |     20 } | 
|     49  |     21  | 
|     50 def similarity(s1: String, s2: String) : Double = |     22  | 
|     51   overlap(clean(s1), clean(s2)) |     23 //(3) | 
|         |     24 def prod(lst1: List[String], lst2: List[String]) : Int = { | 
|         |     25     val occM1 = occurrences(lst1) | 
|         |     26     val occM2 = occurrences(lst2) | 
|         |     27     (for (s <- occM1) yield s._2 * occM2.getOrElse(s._1,0)).sum | 
|         |     28 } | 
|     52  |     29  | 
|     53  |     30  | 
|     54 /* |     31 //(4) | 
|         |     32 def overlap(lst1: List[String], lst2: List[String]) : Double = prod(lst1,lst2) / prod(lst1,lst1).max(prod(lst2,lst2)) | 
|         |     33  | 
|         |     34 def similarity(s1: String, s2: String) : Double = overlap(clean(s1), clean(s2)) | 
|         |     35  | 
|         |     36  | 
|         |     37  | 
|         |     38 /* Test cases | 
|     55  |     39  | 
|     56  |     40  | 
|     57 val list1 = List("a", "b", "b", "c", "d")  |     41 val list1 = List("a", "b", "b", "c", "d")  | 
|     58 val list2 = List("d", "b", "d", "b", "d") |     42 val list2 = List("d", "b", "d", "b", "d") | 
|     59  |     43  | 
|     79 Australia. Australia has a comparative advantage in the highly |     63 Australia. Australia has a comparative advantage in the highly | 
|     80 competitive tourism industry due to its rich and varied natural |     64 competitive tourism industry due to its rich and varied natural | 
|     81 heritage which ensures Australia's capacity to attract international |     65 heritage which ensures Australia's capacity to attract international | 
|     82 ecotourists.""" |     66 ecotourists.""" | 
|     83  |     67  | 
|     84 similarity(orig1, plag1) |     68 similarity(orig1, plag1) // 0.8679245283018868 | 
|     85  |     69  | 
|     86  |     70  | 
|     87 // Plagiarism examples from  |     71 // Plagiarism examples from  | 
|     88 // https://www.utc.edu/library/help/tutorials/plagiarism/examples-of-plagiarism.php |     72 // https://www.utc.edu/library/help/tutorials/plagiarism/examples-of-plagiarism.php | 
|     89  |     73  | 
|    103 usually broken into short-term (acute) and long-term (chronic) |     87 usually broken into short-term (acute) and long-term (chronic) | 
|    104 effects. Both of these types of harm must be addressed in ecosystem |     88 effects. Both of these types of harm must be addressed in ecosystem | 
|    105 recovery: a controversial tactic that is often implemented immediately |     89 recovery: a controversial tactic that is often implemented immediately | 
|    106 following an oil spill.""" |     90 following an oil spill.""" | 
|    107  |     91  | 
|    108 overlap(clean(orig2), clean(plag2)) |     92 overlap(clean(orig2), clean(plag2))  // 0.728 | 
|    109 similarity(orig2, plag2) |     93 similarity(orig2, plag2)             // 0.728 | 
|    110  |     94  | 
|         |     95  | 
|         |     96   | 
|    111 // The punchline: everything above 0.6 looks suspicious and  |     97 // The punchline: everything above 0.6 looks suspicious and  | 
|    112 // should be looked at by staff. |     98 // should be investigated by staff. | 
|    113  |     99  | 
|    114 */ |    100 */ | 
|    115  |    101  | 
|    116  |         | 
|    117 } |    102 } |