solutions2/docdiff.scala
changeset 346 663c2a9108d1
parent 345 40657f9a4e4a
child 347 4de31fdc0d67
equal deleted inserted replaced
345:40657f9a4e4a 346:663c2a9108d1
     1 // Preliminary Part about Code Similarity
       
     2 //========================================
       
     3 
       
     4 
       
     object CW7a {

       // Pattern for a "word": a maximal run of \w characters. Compiled once
       // here instead of on every call to clean.
       private val WordPattern: scala.util.matching.Regex = """\w+""".r

       //(1) The clean function finds all words in a string using the
       //    regular expression \w+ and the library function
       //
       //         some_regex.findAllIn(some_string)
       //
       //    The words are returned as a list of strings, in the order in
       //    which they appear in s. A string without any word yields Nil.

       def clean(s: String): List[String] =
         WordPattern.findAllIn(s).toList


       //(2) The function occurrences calculates the number of times
       //    strings occur in a list of strings. These occurrences are
       //    calculated as a Map from strings to integers. Strings that do
       //    not occur in xs are not keys of the map; the empty list gives
       //    the empty map.

       def occurrences(xs: List[String]): Map[String, Int] =
         // Group equal strings together and count each group; this walks the
         // list once instead of re-counting it for every distinct string.
         xs.groupBy(identity).map { case (word, group) => word -> group.size }


       //(3) This function calculates the dot-product of two documents
       //    (lists of strings). For this it calculates the occurrence
       //    maps from (2) and then multiplies the corresponding occurrences.
       //    If a string does not occur in a document, the product is zero.
       //    The function finally sums up all products.

       def prod(lst1: List[String], lst2: List[String]): Int = {
         val occs1 = occurrences(lst1)
         val occs2 = occurrences(lst2)
         // Words missing from lst1 contribute 0 to the sum, so it is enough
         // to walk over the words of lst1 and look each one up in lst2.
         occs1.iterator.map { case (word, n) => n * occs2.getOrElse(word, 0) }.sum
       }


       //(4) The overlap of two documents is the dot-product of the two
       //    documents divided by the larger of the dot-products of each
       //    document with itself. The similarity of two strings is given
       //    by the overlap of the cleaned (see (1)) strings.
       //
       //    If both documents are empty the denominator is zero; in that
       //    case 0.0 is returned instead of NaN, which is also the result
       //    when only one of the two documents is empty.

       def overlap(lst1: List[String], lst2: List[String]): Double = {
         val m1 = prod(lst1, lst1)
         val m2 = prod(lst2, lst2)
         val denominator = math.max(m1, m2)
         if (denominator == 0) 0.0
         else prod(lst1, lst2).toDouble / denominator
       }

       def similarity(s1: String, s2: String): Double =
         overlap(clean(s1), clean(s2))


     /*


     val list1 = List("a", "b", "b", "c", "d")
     val list2 = List("d", "b", "d", "b", "d")

     occurrences(List("a", "b", "b", "c", "d"))   // Map(a -> 1, b -> 2, c -> 1, d -> 1)
     occurrences(List("d", "b", "d", "b", "d"))   // Map(d -> 3, b -> 2)

     prod(list1,list2) // 7

     overlap(list1, list2)   // 0.5384615384615384
     overlap(list2, list1)   // 0.5384615384615384
     overlap(list1, list1)   // 1.0
     overlap(list2, list2)   // 1.0

     // Plagiarism examples from
     // https://desales.libguides.com/avoidingplagiarism/examples

     val orig1 = """There is a strong market demand for eco-tourism in
     Australia. Its rich and diverse natural heritage ensures Australia's
     capacity to attract international ecotourists and gives Australia a
     comparative advantage in the highly competitive tourism industry."""

     val plag1 = """There is a high market demand for eco-tourism in
     Australia. Australia has a comparative advantage in the highly
     competitive tourism industry due to its rich and varied natural
     heritage which ensures Australia's capacity to attract international
     ecotourists."""

     similarity(orig1, plag1)


     // Plagiarism examples from
     // https://www.utc.edu/library/help/tutorials/plagiarism/examples-of-plagiarism.php

     val orig2 = """No oil spill is entirely benign. Depending on timing and
     location, even a relatively minor spill can cause significant harm to
     individual organisms and entire populations. Oil spills can cause
     impacts over a range of time scales, from days to years, or even
     decades for certain spills. Impacts are typically divided into acute
     (short-term) and chronic (long-term) effects. Both types are part of a
     complicated and often controversial equation that is addressed after
     an oil spill: ecosystem recovery."""

     val plag2 = """There is no such thing as a "good" oil spill. If the
     time and place are just right, even a small oil spill can cause damage
     to sensitive ecosystems. Further, spills can cause harm days, months,
     years, or even decades after they occur. Because of this, spills are
     usually broken into short-term (acute) and long-term (chronic)
     effects. Both of these types of harm must be addressed in ecosystem
     recovery: a controversial tactic that is often implemented immediately
     following an oil spill."""

     overlap(clean(orig2), clean(plag2))
     similarity(orig2, plag2)

     // The punchline: everything above 0.6 looks suspicious and
     // should be looked at by staff.

     */


     }