| 
283
 | 
     1  | 
// Preliminary Part about Code Similarity
  | 
| 
 | 
     2  | 
//========================================
  | 
| 
211
 | 
     3  | 
  | 
| 
 | 
     4  | 
  | 
| 
320
 | 
     5  | 
object CW7a { 
 | 
| 
 | 
     6  | 
  | 
| 
211
 | 
     7  | 
  | 
| 
 | 
     8  | 
//(1) Complete the clean function below. It should find
  | 
| 
 | 
     9  | 
//    all words in a string using the regular expression
  | 
| 
 | 
    10  | 
//    \w+  and the library function 
  | 
| 
 | 
    11  | 
//
  | 
| 
 | 
    12  | 
//         some_regex.findAllIn(some_string)
  | 
| 
 | 
    13  | 
//
  | 
| 
 | 
    14  | 
//    The words should be returned as a list of strings.
  | 
| 
 | 
    15  | 
  | 
| 
320
 | 
    16  | 
  | 
| 
 | 
    17  | 
/** Extracts all words from a string.
  *
  * A word is a maximal run of characters matched by the regular expression
  * `\w+` (letters, digits and underscore). Every other character, including
  * spaces, newlines and punctuation, acts as a separator and is dropped.
  *
  * @param s the text to split into words; may be empty
  * @return the words in order of appearance; `Nil` if `s` contains no word
  */
def clean(s: String) : List[String] = {
    val regex = """\w+""".r
    // Matching over the whole string (rather than over space-separated tokens)
    // handles newlines, repeated spaces and leading/trailing punctuation uniformly.
    regex.findAllIn(s).toList
}
  | 
| 
211
 | 
    23  | 
  | 
| 
320
 | 
    24  | 
/** Splits `returned` at the positions where characters were removed from `original`.
  *
  * `returned` is expected to be `original` with some characters deleted (for
  * example the concatenation of all regular-expression matches found in
  * `original`). Both strings are scanned in parallel; whenever `original` holds
  * a character that `returned` lacks, the piece of `returned` collected so far
  * is cut off as a separate string.
  *
  * @param original the word before any characters were removed
  * @param returned the word after the disallowed characters were removed
  * @param i        the index at which to start comparing; the characters before
  *                 it are taken to be identical in both strings. Values outside
  *                 the strings are clamped.
  * @return the non-empty substrings of `returned` that were separated by removed
  *         characters in `original`, in order; `Nil` if `returned` is empty
  */
def divide_string_where_different(original: String, returned: String, i : Int): List[String] ={
    // o / r: current positions in original / returned;
    // start: where the piece currently being collected begins in returned.
    @scala.annotation.tailrec
    def scan(o: Int, r: Int, start: Int, pieces: List[String]): List[String] =
        if (o >= original.length || r >= returned.length) {
            // One of the strings is exhausted: the rest of returned is the last piece.
            val last = returned.substring(start)
            (if (last.nonEmpty) last :: pieces else pieces).reverse
        }
        else if (original(o) == returned(r)) scan(o + 1, r + 1, start, pieces)
        else {
            // original(o) was removed: close the current piece and skip that character.
            val piece = returned.substring(start, r)
            scan(o + 1, r, r, if (piece.nonEmpty) piece :: pieces else pieces)
        }

    // Clamp the start index so that neither string is indexed out of bounds.
    val from = math.max(0, math.min(i, math.min(original.length, returned.length)))
    scan(from, from, 0, Nil)
}
  | 
| 
211
 | 
    36  | 
  | 
| 
 | 
    37  | 
//(2) The function occurrences calculates the number of times  
  | 
| 
 | 
    38  | 
//    strings occur in a list of strings. These occurrences should 
  | 
| 
 | 
    39  | 
//    be calculated as a Map from strings to integers.
  | 
| 
 | 
    40  | 
  | 
| 
320
 | 
    41  | 
  | 
| 
 | 
    42  | 
/** Counts how often each string occurs in a list.
  *
  * @param xs the strings to count; may be empty
  * @return a map from each distinct string in `xs` to its number of occurrences
  */
def occurrences(xs: List[String]): Map[String, Int] = {
    // Single pass over the list: bump the counter of each word as it is seen,
    // instead of re-scanning the whole list once per distinct word.
    xs.foldLeft(Map.empty[String, Int]) { (counts, word) =>
        counts.updated(word, counts.getOrElse(word, 0) + 1)
    }
}
  | 
| 
 | 
    47  | 
  | 
| 
 | 
    48  | 
  | 
| 
211
 | 
    49  | 
  | 
| 
 | 
    50  | 
//(3) This function calculates the dot-product of two documents
  | 
| 
 | 
    51  | 
//    (list of strings). For this it calculates the occurrence
  | 
| 
 | 
    52  | 
//    maps from (2) and then multiplies the corresponding occurrences. 
  | 
| 
 | 
    53  | 
//    If a string does not occur in a document, the product is zero.
  | 
| 
 | 
    54  | 
//    The function finally sums up all products. 
  | 
| 
 | 
    55  | 
  | 
| 
320
 | 
    56  | 
  | 
| 
211
 | 
    57  | 
/** Computes the dot-product of two documents given as lists of strings.
  *
  * The occurrence map of each document is built with `occurrences`; for every
  * string of the first document its count is multiplied by the count of the
  * same string in the second document, and the products are summed.
  *
  * @param lst1 the words of the first document
  * @param lst2 the words of the second document
  * @return the sum of the products of corresponding occurrence counts; strings
  *         that appear in only one document contribute 0
  */
def prod(lst1: List[String], lst2: List[String]) : Int = {
    val map1 = occurrences(lst1)
    val map2 = occurrences(lst2)
    // A word missing from map2 counts as 0 there, so its product vanishes.
    // Iterating via `iterator` keeps equal products from being merged or dropped.
    map1.iterator.map { case (word, count1) => count1 * map2.getOrElse(word, 0) }.sum
}
  | 
| 
 | 
    67  | 
  | 
| 
320
 | 
    68  | 
  | 
| 
211
 | 
    69  | 
//(4) Complete the functions overlap and similarity. The overlap of
  | 
| 
 | 
    70  | 
//    two documents is calculated by the formula given in the assignment
  | 
| 
 | 
    71  | 
//    description. The similarity of two strings is given by the overlap
  | 
| 
320
 | 
    72  | 
//    of the cleaned strings (see (1)).  
  | 
| 
211
 | 
    73  | 
  | 
| 
 | 
    74  | 
  | 
| 
320
 | 
    75  | 
//def overlap(lst1: List[String], lst2: List[String]) : Double = ...
  | 
| 
 | 
    76  | 
  | 
| 
 | 
    77  | 
//def similarity(s1: String, s2: String) : Double = ...
  | 
| 
 | 
    78  | 
  | 
| 
 | 
    79  | 
  | 
| 
 | 
    80  | 
  | 
| 
 | 
    81  | 
  | 
| 
 | 
    82  | 
/* Test cases
  | 
| 
211
 | 
    83  | 
  | 
| 
 | 
    84  | 
  | 
| 
 | 
    85  | 
val list1 = List("a", "b", "b", "c", "d") 
 | 
| 
 | 
    86  | 
val list2 = List("d", "b", "d", "b", "d")
 | 
| 
 | 
    87  | 
  | 
| 
 | 
    88  | 
occurrences(List("a", "b", "b", "c", "d"))   // Map(a -> 1, b -> 2, c -> 1, d -> 1)
 | 
| 
 | 
    89  | 
occurrences(List("d", "b", "d", "b", "d"))   // Map(d -> 3, b -> 2)
 | 
| 
 | 
    90  | 
  | 
| 
 | 
    91  | 
prod(list1,list2) // 7 
  | 
| 
320
 | 
    92  | 
prod(list1,list1)
  | 
| 
 | 
    93  | 
prod(list2,list2)
  | 
| 
211
 | 
    94  | 
  | 
| 
 | 
    95  | 
overlap(list1, list2)   // 0.5384615384615384
  | 
| 
 | 
    96  | 
overlap(list2, list1)   // 0.5384615384615384
  | 
| 
 | 
    97  | 
overlap(list1, list1)   // 1.0
  | 
| 
 | 
    98  | 
overlap(list2, list2)   // 1.0
  | 
| 
 | 
    99  | 
  | 
| 
 | 
   100  | 
// Plagiarism examples from 
  | 
| 
 | 
   101  | 
// https://desales.libguides.com/avoidingplagiarism/examples
  | 
| 
 | 
   102  | 
  | 
| 
 | 
   103  | 
val orig1 = """There is a strong market demand for eco-tourism in
  | 
| 
 | 
   104  | 
Australia. Its rich and diverse natural heritage ensures Australia's
  | 
| 
 | 
   105  | 
capacity to attract international ecotourists and gives Australia a
  | 
| 
 | 
   106  | 
comparative advantage in the highly competitive tourism industry."""
  | 
| 
 | 
   107  | 
  | 
| 
 | 
   108  | 
val plag1 = """There is a high market demand for eco-tourism in
  | 
| 
 | 
   109  | 
Australia. Australia has a comparative advantage in the highly
  | 
| 
 | 
   110  | 
competitive tourism industry due to its rich and varied natural
  | 
| 
 | 
   111  | 
heritage which ensures Australia's capacity to attract international
  | 
| 
 | 
   112  | 
ecotourists."""
  | 
| 
 | 
   113  | 
  | 
| 
320
 | 
   114  | 
similarity(orig1, plag1) // 0.8679245283018868
  | 
| 
211
 | 
   115  | 
  | 
| 
 | 
   116  | 
  | 
| 
 | 
   117  | 
// Plagiarism examples from 
  | 
| 
 | 
   118  | 
// https://www.utc.edu/library/help/tutorials/plagiarism/examples-of-plagiarism.php
  | 
| 
 | 
   119  | 
  | 
| 
 | 
   120  | 
val orig2 = """No oil spill is entirely benign. Depending on timing and
  | 
| 
 | 
   121  | 
location, even a relatively minor spill can cause significant harm to
  | 
| 
 | 
   122  | 
individual organisms and entire populations. Oil spills can cause
  | 
| 
 | 
   123  | 
impacts over a range of time scales, from days to years, or even
  | 
| 
 | 
   124  | 
decades for certain spills. Impacts are typically divided into acute
  | 
| 
 | 
   125  | 
(short-term) and chronic (long-term) effects. Both types are part of a
  | 
| 
 | 
   126  | 
complicated and often controversial equation that is addressed after
  | 
| 
 | 
   127  | 
an oil spill: ecosystem recovery."""
  | 
| 
 | 
   128  | 
  | 
| 
 | 
   129  | 
val plag2 = """There is no such thing as a "good" oil spill. If the
  | 
| 
 | 
   130  | 
time and place are just right, even a small oil spill can cause damage
  | 
| 
 | 
   131  | 
to sensitive ecosystems. Further, spills can cause harm days, months,
  | 
| 
 | 
   132  | 
years, or even decades after they occur. Because of this, spills are
  | 
| 
 | 
   133  | 
usually broken into short-term (acute) and long-term (chronic)
  | 
| 
 | 
   134  | 
effects. Both of these types of harm must be addressed in ecosystem
  | 
| 
 | 
   135  | 
recovery: a controversial tactic that is often implemented immediately
  | 
| 
 | 
   136  | 
following an oil spill."""
  | 
| 
 | 
   137  | 
  | 
| 
320
 | 
   138  | 
overlap(clean(orig2), clean(plag2))  // 0.728
  | 
| 
 | 
   139  | 
similarity(orig2, plag2)             // 0.728
  | 
| 
 | 
   140  | 
  | 
| 
211
 | 
   141  | 
  | 
| 
320
 | 
   142  | 
 
  | 
| 
211
 | 
   143  | 
// The punchline: everything above 0.6 looks suspicious and 
  | 
| 
320
 | 
   144  | 
// should be investigated by staff.
  | 
| 
211
 | 
   145  | 
  | 
| 
 | 
   146  | 
*/
  | 
| 
 | 
   147  | 
  | 
| 
320
 | 
   148  | 
}
  | 
| 
211
 | 
   149  | 
  |