// Preliminary Part about Code Similarity
//========================================

object CW7a {

//(1) Complete the clean function below. It should find
//    all words in a string using the regular expression
//    \w+  and the library function
//
//         some_regex.findAllIn(some_string)
//
//    The words should be returned as a list of strings.

// Compiled once and reused by every call to clean.
private val wordPattern = """\w+""".r

/** Extracts every word (maximal run of `\w` characters) from a string.
  *
  * The regular expression is applied to the whole input, so any non-word
  * character (spaces, newlines, punctuation, hyphens, apostrophes) acts as
  * a separator and never shows up in the result.
  *
  * @param s the text to scan; may be empty
  * @return the words in order of appearance; empty list if there are none
  */
def clean(s: String): List[String] = wordPattern.findAllIn(s).toList

/** Splits `returned` at the positions where characters were removed from
  * `original`.
  *
  * Kept for backward compatibility; `clean` does not use it.
  *
  * @param original the word before filtering
  * @param returned `original` with the disallowed characters removed
  *                 (assumed to be a subsequence of `original`)
  * @param i        index at which to start comparing the two strings;
  *                 characters before `i` are taken to be identical
  * @return the non-empty pieces of `returned` that were separated by removed
  *         characters in `original`; empty list if `returned` is empty
  */
def divide_string_where_different(original: String, returned: String, i: Int): List[String] = {
    // Nothing survived the filtering, so there is nothing to split.
    if (returned.isEmpty) Nil
    // Ran off the end of either string: whatever is left of `original` can
    // only be removed characters, so `returned` is the final piece.
    else if (i >= original.length || i >= returned.length) List(returned)
    else if (original(i) != returned(i)) {
        // original(i) was removed: drop it and continue with the remainders.
        val rest = divide_string_where_different(original.substring(i + 1), returned.substring(i), 0)
        // A mismatch at index 0 means there is no piece before the gap.
        if (i == 0) rest else returned.substring(0, i) :: rest
    }
    else divide_string_where_different(original, returned, i + 1)
}

//(2) The function occurrences calculates the number of times
//    strings occur in a list of strings. These occurrences should
//    be calculated as a Map from strings to integers.

/** Counts how often each string occurs in a list.
  *
  * Makes a single pass over the list instead of re-counting the whole list
  * once per distinct word.
  *
  * @param xs the strings to count; may be empty
  * @return a map from each distinct string to its number of occurrences
  */
def occurrences(xs: List[String]): Map[String, Int] =
    xs.foldLeft(Map.empty[String, Int]) { (counts, word) =>
        counts.updated(word, counts.getOrElse(word, 0) + 1)
    }

//(3) This function calculates the dot-product of two documents
//    (list of strings). For this it calculates the occurrence
//    maps from (2) and then multiplies the corresponding occurrences.
//    If a string does not occur in a document, the product is zero.
//    The function finally sums up all products.

/** Dot-product of two documents based on their word-occurrence counts.
  *
  * @param lst1 the words of the first document
  * @param lst2 the words of the second document
  * @return the sum, over all words of `lst1`, of the word's count in `lst1`
  *         times its count in `lst2` (zero when the word is absent there)
  */
def prod(lst1: List[String], lst2: List[String]): Int = {
    val counts1 = occurrences(lst1)
    val counts2 = occurrences(lst2)
    // A word missing from the second document contributes 0 to the sum.
    counts1.iterator.map { case (word, n) => n * counts2.getOrElse(word, 0) }.sum
}

//(4) Complete the functions overlap and similarity. The overlap of
//    two documents is calculated by the formula given in the assignment
//    description. The similarity of two strings is given by the overlap
//    of the cleaned strings (see (1)).

//def overlap(lst1: List[String], lst2: List[String]) : Double = ...

//def similarity(s1: String, s2: String) : Double = ...


/* Test cases


val list1 = List("a", "b", "b", "c", "d")
val list2 = List("d", "b", "d", "b", "d")

occurrences(List("a", "b", "b", "c", "d"))   // Map(a -> 1, b -> 2, c -> 1, d -> 1)
occurrences(List("d", "b", "d", "b", "d"))   // Map(d -> 3, b -> 2)

prod(list1,list2) // 7
prod(list1,list1) // 7
prod(list2,list2) // 13

overlap(list1, list2)   // 0.5384615384615384
overlap(list2, list1)   // 0.5384615384615384
overlap(list1, list1)   // 1.0
overlap(list2, list2)   // 1.0

// Plagiarism examples from
// https://desales.libguides.com/avoidingplagiarism/examples

val orig1 = """There is a strong market demand for eco-tourism in
Australia. Its rich and diverse natural heritage ensures Australia's
capacity to attract international ecotourists and gives Australia a
comparative advantage in the highly competitive tourism industry."""

val plag1 = """There is a high market demand for eco-tourism in
Australia. Australia has a comparative advantage in the highly
competitive tourism industry due to its rich and varied natural
heritage which ensures Australia's capacity to attract international
ecotourists."""

similarity(orig1, plag1) // 0.8679245283018868


// Plagiarism examples from
// https://www.utc.edu/library/help/tutorials/plagiarism/examples-of-plagiarism.php

val orig2 = """No oil spill is entirely benign. Depending on timing and
location, even a relatively minor spill can cause significant harm to
individual organisms and entire populations. Oil spills can cause
impacts over a range of time scales, from days to years, or even
decades for certain spills. Impacts are typically divided into acute
(short-term) and chronic (long-term) effects. Both types are part of a
complicated and often controversial equation that is addressed after
an oil spill: ecosystem recovery."""

val plag2 = """There is no such thing as a "good" oil spill. If the
time and place are just right, even a small oil spill can cause damage
to sensitive ecosystems. Further, spills can cause harm days, months,
years, or even decades after they occur. Because of this, spills are
usually broken into short-term (acute) and long-term (chronic)
effects. Both of these types of harm must be addressed in ecosystem
recovery: a controversial tactic that is often implemented immediately
following an oil spill."""

overlap(clean(orig2), clean(plag2))  // 0.728
similarity(orig2, plag2)             // 0.728



// The punchline: everything above 0.6 looks suspicious and
// should be investigated by staff.

*/

}
 | 
| 211 |    149 | 
 |