// Core Part 2 about Code Similarity
//===================================


/** Core Part 2: a simple bag-of-words measure of similarity between documents.
  *
  * A document is a list of words. Two documents are compared through the
  * dot-product of their word-occurrence maps, normalised by the larger of the
  * two self-products, which yields a value between 0.0 and 1.0.
  */
object C2 {

// Matches maximal runs of word characters (letters, digits and underscore).
private val WordPattern = """\w+""".r


/** (1) Finds all words in a string using the regular expression `\w+`.
  *
  * Anything that is not a word character (whitespace, punctuation, hyphens,
  * apostrophes) acts as a separator, so "Australia's" yields "Australia" and "s".
  *
  * @param s the text to split
  * @return the words in order of appearance; empty if `s` contains no word
  */
def clean(s: String): List[String] = WordPattern.findAllIn(s).toList


/** (2) Calculates the number of times each string occurs in a list of strings.
  *
  * @param xs the words of a document
  * @return a map from each distinct word to its (positive) number of occurrences
  */
def occurrences(xs: List[String]): Map[String, Int] =
  xs.groupBy(identity).map { case (word, group) => word -> group.size }


/** (3) Calculates the dot-product of two documents (lists of strings).
  *
  * The occurrence maps from (2) are computed for both documents and the
  * corresponding occurrences are multiplied. If a string does not occur in
  * one of the documents, its product is zero. All products are summed up.
  *
  * @param lst1 the words of the first document
  * @param lst2 the words of the second document
  * @return the sum over all words of (occurrences in lst1) * (occurrences in lst2)
  */
def prod(lst1: List[String], lst2: List[String]): Int = {
  val occ1 = occurrences(lst1)
  val occ2 = occurrences(lst2)
  // A word absent from lst1 contributes zero, so walking occ1 alone is enough.
  occ1.iterator.map { case (word, count) => count * occ2.getOrElse(word, 0) }.sum
}


/** (4a) Calculates the overlap of two documents:
  *
  * {{{
  *   overlap(d1, d2) = prod(d1, d2) / max(prod(d1, d1), prod(d2, d2))
  * }}}
  *
  * The result is symmetric and is 1.0 for two non-empty documents with the
  * same occurrence map.
  *
  * @param lst1 the words of the first document
  * @param lst2 the words of the second document
  * @return the overlap in [0.0, 1.0]; 0.0 if neither document contains a word
  */
def overlap(lst1: List[String], lst2: List[String]): Double = {
  val norm = math.max(prod(lst1, lst1), prod(lst2, lst2))
  // norm is zero only when both documents are empty; avoid 0/0 (NaN) there.
  if (norm == 0) 0.0 else prod(lst1, lst2).toDouble / norm
}

/** (4b) Calculates the similarity of two strings, that is the overlap of
  * their cleaned word lists (see (1)).
  *
  * @param s1 the first text
  * @param s2 the second text
  * @return the overlap of `clean(s1)` and `clean(s2)`
  */
def similarity(s1: String, s2: String): Double = overlap(clean(s1), clean(s2))



/* Test cases


val list1 = List("a", "b", "b", "c", "d")
val list2 = List("d", "b", "d", "b", "d")

occurrences(List("a", "b", "b", "c", "d"))   // Map(a -> 1, b -> 2, c -> 1, d -> 1)
occurrences(List("d", "b", "d", "b", "d"))   // Map(d -> 3, b -> 2)

prod(list1,list2) // 7

overlap(list1, list2)   // 0.5384615384615384
overlap(list2, list1)   // 0.5384615384615384
overlap(list1, list1)   // 1.0
overlap(list2, list2)   // 1.0

// Plagiarism examples from
// https://desales.libguides.com/avoidingplagiarism/examples

val orig1 = """There is a strong market demand for eco-tourism in
Australia. Its rich and diverse natural heritage ensures Australia's
capacity to attract international ecotourists and gives Australia a
comparative advantage in the highly competitive tourism industry."""

val plag1 = """There is a high market demand for eco-tourism in
Australia. Australia has a comparative advantage in the highly
competitive tourism industry due to its rich and varied natural
heritage which ensures Australia's capacity to attract international
ecotourists."""

similarity(orig1, plag1) // 0.8679245283018868


// Plagiarism examples from
// https://www.utc.edu/library/help/tutorials/plagiarism/examples-of-plagiarism.php

val orig2 = """No oil spill is entirely benign. Depending on timing and
location, even a relatively minor spill can cause significant harm to
individual organisms and entire populations. Oil spills can cause
impacts over a range of time scales, from days to years, or even
decades for certain spills. Impacts are typically divided into acute
(short-term) and chronic (long-term) effects. Both types are part of a
complicated and often controversial equation that is addressed after
an oil spill: ecosystem recovery."""

val plag2 = """There is no such thing as a "good" oil spill. If the
time and place are just right, even a small oil spill can cause damage
to sensitive ecosystems. Further, spills can cause harm days, months,
years, or even decades after they occur. Because of this, spills are
usually broken into short-term (acute) and long-term (chronic)
effects. Both of these types of harm must be addressed in ecosystem
recovery: a controversial tactic that is often implemented immediately
following an oil spill."""

overlap(clean(orig2), clean(plag2))  // 0.728
similarity(orig2, plag2)             // 0.728



// The punchline: everything above 0.6 looks suspicious and
// should be investigated by staff.

*/

}