core_testing2/docdiff.scala
changeset 481 e03a0100ec46
parent 401 9471c3b7ea02
equal deleted inserted replaced
480:a623dd1f2898 481:e03a0100ec46
     1 // Preliminary Part about Code Similarity
     1 // Core Part 2 about Code Similarity
     2 //========================================
     2 //===================================
     3 
     3 
     4 
     4 
     5 object C2 { 
     5 object C2 { 
     6 
     6 
     7 //(1) Complete the clean function below. It should find
     7 // ADD YOUR CODE BELOW
     8 //    all words in a string using the regular expression
     8 //======================
     9 //    \w+  and the library function 
       
    10 //
       
    11 //         some_regex.findAllIn(some_string)
       
    12 //
       
    13 //    The words should be returned as a list of strings.
       
    14 
     9 
    15 def clean(s: String) : List[String] = 
    10 //(1)
    16   ("""\w+""".r).findAllIn(s).toList
    11 def clean(s: String) : List[String] = """(\w+)""".r.findAllIn(s).toList
       
    12   
    17 
    13 
    18 
    14 
    19 //(2) The function occurrences calculates the number of times  
    15 //(2)
    20 //    strings occur in a list of strings. These occurrences should 
    16 def occurrences(xs: List[String]): Map[String, Int] = {
    21 //    be calculated as a Map from strings to integers.
    17     val ls = xs.distinct
    22 
    18     val occLs = for (s <- ls) yield (s, xs.count(_.equals(s)))
    23 def occurrences(xs: List[String]): Map[String, Int] =
    19     occLs.toMap
    24   (for (x <- xs.distinct) yield (x, xs.count(_ == x))).toMap
       
    25 
       
    26 //(3) This function calculates the dot-product of two documents
       
    27 //    (list of strings). For this it calculates the occurrence
       
    28 //    maps from (2) and then multiplies the corresponding occurrences. 
       
    29 //    If a string does not occur in a document, the product is zero.
       
    30 //    The function finally sums up all products. 
       
    31 
       
    32 def prod(lst1: List[String], lst2: List[String]) : Int = {
       
    33     val words = (lst1 ::: lst2).distinct
       
    34     val occs1 = occurrences(lst1)
       
    35     val occs2 = occurrences(lst2)
       
    36     words.map{ w => occs1.getOrElse(w, 0) * occs2.getOrElse(w, 0) }.sum
       
    37 }          
       
    38 
       
    39 //(4) Complete the functions overlap and similarity. The overlap of
       
    40 //    two documents is calculated by the formula given in the assignment
       
    41 //    description. The similarity of two strings is given by the overlap
       
    42 //    of the cleaned (see (1)) strings.  
       
    43 
       
    44 def overlap(lst1: List[String], lst2: List[String]) : Double = {
       
    45     val m1 = prod(lst1, lst1)
       
    46     val m2 = prod(lst2, lst2) 
       
    47     prod(lst1, lst2).toDouble / (List(m1, m2).max)
       
    48 }
    20 }
    49 
    21 
    50 def similarity(s1: String, s2: String) : Double =
    22 
    51   overlap(clean(s1), clean(s2))
    23 //(3)
       
    24 def prod(lst1: List[String], lst2: List[String]) : Int = {
       
    25     val occM1 = occurrences(lst1)
       
    26     val occM2 = occurrences(lst2)
       
    27     (for (s <- occM1) yield s._2 * occM2.getOrElse(s._1,0)).sum
       
    28 }
    52 
    29 
    53 
    30 
    54 /*
    31 //(4)
       
    32 def overlap(lst1: List[String], lst2: List[String]) : Double = prod(lst1,lst2).toDouble / prod(lst1,lst1).max(prod(lst2,lst2)) // .toDouble forces floating-point division; Int / Int would truncate the ratio to 0 or 1
       
    33 
       
    34 def similarity(s1: String, s2: String) : Double = overlap(clean(s1), clean(s2))
       
    35 
       
    36 
       
    37 
       
    38 /* Test cases
    55 
    39 
    56 
    40 
    57 val list1 = List("a", "b", "b", "c", "d") 
    41 val list1 = List("a", "b", "b", "c", "d") 
    58 val list2 = List("d", "b", "d", "b", "d")
    42 val list2 = List("d", "b", "d", "b", "d")
    59 
    43 
    79 Australia. Australia has a comparative advantage in the highly
    63 Australia. Australia has a comparative advantage in the highly
    80 competitive tourism industry due to its rich and varied natural
    64 competitive tourism industry due to its rich and varied natural
    81 heritage which ensures Australia's capacity to attract international
    65 heritage which ensures Australia's capacity to attract international
    82 ecotourists."""
    66 ecotourists."""
    83 
    67 
    84 similarity(orig1, plag1)
    68 similarity(orig1, plag1) // 0.8679245283018868
    85 
    69 
    86 
    70 
    87 // Plagiarism examples from 
    71 // Plagiarism examples from 
    88 // https://www.utc.edu/library/help/tutorials/plagiarism/examples-of-plagiarism.php
    72 // https://www.utc.edu/library/help/tutorials/plagiarism/examples-of-plagiarism.php
    89 
    73 
   103 usually broken into short-term (acute) and long-term (chronic)
    87 usually broken into short-term (acute) and long-term (chronic)
   104 effects. Both of these types of harm must be addressed in ecosystem
    88 effects. Both of these types of harm must be addressed in ecosystem
   105 recovery: a controversial tactic that is often implemented immediately
    89 recovery: a controversial tactic that is often implemented immediately
   106 following an oil spill."""
    90 following an oil spill."""
   107 
    91 
   108 overlap(clean(orig2), clean(plag2))
    92 overlap(clean(orig2), clean(plag2))  // 0.728
   109 similarity(orig2, plag2)
    93 similarity(orig2, plag2)             // 0.728
   110 
    94 
       
    95 
       
    96  
   111 // The punchline: everything above 0.6 looks suspicious and 
    97 // The punchline: everything above 0.6 looks suspicious and 
   112 // should be looked at by staff.
    98 // should be investigated by staff.
   113 
    99 
   114 */
   100 */
   115 
   101 
   116 
       
   117 }
   102 }