| 412 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |      1 | // Core Part 2 about Code Similarity
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |      2 | //===================================
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |      3 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |      4 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |      5 | object C2 { 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |      6 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |      7 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |      8 | //(1) Complete the clean function below. It should find
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |      9 | //    all words in a string using the regular expression
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     10 | //    \w+  and the library function 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     11 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     12 | //         some_regex.findAllIn(some_string)
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     13 | //
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     14 | //    The words should be Returned as a list of strings.
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     15 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     16 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     17 | def clean(s: String) : List[String] = ???
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     18 |   
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     19 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     20 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     21 | //(2) The function occurrences calculates the number of times  
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     22 | //    strings occur in a list of strings. These occurrences should 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     23 | //    be calculated as a Map from strings to integers.
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     24 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     25 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     26 | def occurrences(xs: List[String]): Map[String, Int] = ???
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     27 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     28 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     29 | //(3) This functions calculates the dot-product of two documents
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     30 | //    (list of strings). For this it calculates the occurrence
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     31 | //    maps from (2) and then multiplies the corresponding occurrences. 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     32 | //    If a string does not occur in a document, the product is zero.
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     33 | //    The function finally sums up all products. 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     34 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     35 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     36 | def prod(lst1: List[String], lst2: List[String]) : Int = ???
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     37 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     38 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     39 | //(4) Complete the functions overlap and similarity. The overlap of
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     40 | //    two documents is calculated by the formula given in the assignment
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     41 | //    description. The similarity of two strings is given by the overlap
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     42 | //    of the cleaned strings (see (1)).  
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     43 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     44 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     45 | def overlap(lst1: List[String], lst2: List[String]) : Double = ???
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     46 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     47 | def similarity(s1: String, s2: String) : Double = ???
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     48 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     49 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     50 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     51 | /* Test cases
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     52 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     53 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     54 | val list1 = List("a", "b", "b", "c", "d") 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     55 | val list2 = List("d", "b", "d", "b", "d")
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     56 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     57 | occurrences(List("a", "b", "b", "c", "d"))   // Map(a -> 1, b -> 2, c -> 1, d -> 1)
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     58 | occurrences(List("d", "b", "d", "b", "d"))   // Map(d -> 3, b -> 2)
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     59 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     60 | prod(list1,list2) // 7 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     61 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     62 | overlap(list1, list2)   // 0.5384615384615384
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     63 | overlap(list2, list1)   // 0.5384615384615384
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     64 | overlap(list1, list1)   // 1.0
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     65 | overlap(list2, list2)   // 1.0
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     66 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     67 | // Plagiarism examples from 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     68 | // https://desales.libguides.com/avoidingplagiarism/examples
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     69 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     70 | val orig1 = """There is a strong market demand for eco-tourism in
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     71 | Australia. Its rich and diverse natural heritage ensures Australia's
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     72 | capacity to attract international ecotourists and gives Australia a
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     73 | comparative advantage in the highly competitive tourism industry."""
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     74 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     75 | val plag1 = """There is a high market demand for eco-tourism in
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     76 | Australia. Australia has a comparative advantage in the highly
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     77 | competitive tourism industry due to its rich and varied natural
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     78 | heritage which ensures Australia's capacity to attract international
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     79 | ecotourists."""
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     80 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     81 | similarity(orig1, plag1) // 0.8679245283018868
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     82 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     83 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     84 | // Plagiarism examples from 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     85 | // https://www.utc.edu/library/help/tutorials/plagiarism/examples-of-plagiarism.php
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     86 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     87 | val orig2 = """No oil spill is entirely benign. Depending on timing and
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     88 | location, even a relatively minor spill can cause significant harm to
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     89 | individual organisms and entire populations. Oil spills can cause
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     90 | impacts over a range of time scales, from days to years, or even
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     91 | decades for certain spills. Impacts are typically divided into acute
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     92 | (short-term) and chronic (long-term) effects. Both types are part of a
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     93 | complicated and often controversial equation that is addressed after
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     94 | an oil spill: ecosystem recovery."""
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     95 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     96 | val plag2 = """There is no such thing as a "good" oil spill. If the
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     97 | time and place are just right, even a small oil spill can cause damage
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     98 | to sensitive ecosystems. Further, spills can cause harm days, months,
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |     99 | years, or even decades after they occur. Because of this, spills are
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    100 | usually broken into short-term (acute) and long-term (chronic)
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    101 | effects. Both of these types of harm must be addressed in ecosystem
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    102 | recovery: a controversial tactic that is often implemented immediately
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    103 | following an oil spill."""
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    104 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    105 | overlap(clean(orig2), clean(plag2))  // 0.728
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    106 | similarity(orig2, plag2)             // 0.728
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    107 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    108 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    109 |  
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    110 | // The punchline: everything above 0.6 looks suspicious and 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    111 | // should be investigated by staff.
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    112 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    113 | */
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    114 | 
 | 
| 
Christian Urban <christian.urban@kcl.ac.uk> parents: diff
changeset |    115 | }
 |