testing2/docdiff.scala
changeset 323 1f8005b4cdf6
parent 320 cdfb2ce30a3d
equal deleted inserted replaced
322:755d165633ec 323:1f8005b4cdf6
     1 // Preliminary Part about Code Similarity
     1 // Preliminary Part about Code Similarity
     2 //========================================
     2 //========================================
     3 
     3 
     4 
     4 object CW7a {
     5 object CW7a { 
       
     6 
     5 
     7 
     6 
     8 //(1) Complete the clean function below. It should find
     7 //(1) Complete the clean function below. It should find
     9 //    all words in a string using the regular expression
     8 //    all words in a string using the regular expression
    10 //    \w+  and the library function 
     9 //    \w+  and the library function
    11 //
    10 //
    12 //         some_regex.findAllIn(some_string)
    11 //         some_regex.findAllIn(some_string)
    13 //
    12 //
    14 //    The words should be returned as a list of strings.
    13 //    The words should be returned as a list of strings.
    15 
    14 
    16 
    15 
    17 def clean(s: String) : List[String] = {
    16 //def clean(s: String) : List[String] = ...
    18     val regex = """\w+""".r;
    17 def clean(s: String) : List[String] =
    19     val list_of_words = s.split(" ").toList
    18     "\\w+".r.findAllIn(s).toList
    20     for(word <- list_of_words;
       
    21         actual_word <- divide_string_where_different(word, regex.findAllIn(word).mkString, 0)) yield actual_word
       
    22 }
       
    23 
    19 
    24 /*
    20 //(2) The function occurrences calculates the number of times
    25     A secondary function that takes as parameters @param original which is the original word, @param returned which is the word after the process of removing 
    21 //    strings occur in a list of strings. These occurrences should
    26     some characters not allowed by a regular expression, and @param i which is the index at which to start comparing the characters of the two words.
       
    27     @return a List of strings representing all the substrings of returned that were previously separated by characters not allowed by the regular expression applied to it.
       
    28 */
       
    29 def divide_string_where_different(original: String, returned: String, i : Int): List[String] ={
       
    30     val max_i = original.length -1
       
    31     if(original(i) != returned(i)) returned.substring(0, i)::divide_string_where_different(original.substring(i+1), returned.substring(i), 0).filter(_.nonEmpty)
       
    32     else if (i == max_i) List(returned)
       
    33     else divide_string_where_different(original,returned, i +1)
       
    34     
       
    35 }
       
    36 
       
    37 //(2) The function occurrences calculates the number of times  
       
    38 //    strings occur in a list of strings. These occurrences should 
       
    39 //    be calculated as a Map from strings to integers.
    22 //    be calculated as a Map from strings to integers.
    40 
    23 
    41 
    24 
    42 def occurrences(xs: List[String]): Map[String, Int] = {
    25 //def occurrences(xs: List[String]): Map[String, Int] = ..
    43     val lst = xs.distinct
    26 def occurrences(xs: List[String]) : Map[String, Int] =
    44     val word_pairs = (for (word <- lst) yield (word, xs.count(_==word))).toList
    27     xs.groupBy(identity).view.mapValues(_.size).toMap
    45     word_pairs.toMap
       
    46 }
       
    47 
       
    48 
       
    49 
    28 
    50 //(3) This function calculates the dot-product of two documents
    29 //(3) This function calculates the dot-product of two documents
    51 //    (list of strings). For this it calculates the occurrence
    30 //    (list of strings). For this it calculates the occurrence
    52 //    maps from (2) and then multiplies the corresponding occurrences. 
    31 //    maps from (2) and then multiplies the corresponding occurrences.
    53 //    If a string does not occur in a document, the product is zero.
    32 //    If a string does not occur in a document, the product is zero.
    54 //    The function finally sums up all products. 
    33 //    The function finally sums up all products.
    55 
    34 
    56 
    35 
    57 def prod(lst1: List[String], lst2: List[String]) : Int = {
    36 //def prod(lst1: List[String], lst2: List[String]) : Int = ..
    58     val map1 = occurrences(lst1)
    37 def prod(lst1: List[String], lst2: List[String]) : Int =
    59     val map2 = occurrences(lst2)
    38     occurrences(lst1).map(x => occurrences(lst2).getOrElse(x._1, 0) * x._2).reduce(_ + _)
    60     print(s"map1 is $map1 \n and map2 is $map2")
       
    61     val pairs = (for(pair1 <- map1 if(map2.get(pair1._1) != None)) yield (pair1._2, map2.get(pair1._1).get)).toList
       
    62     print(s"\n pairs are $pairs")
       
    63     val products = (for(pair <- pairs) yield pair._1 * pair._2).toList
       
    64     products.sum
       
    65 
       
    66 }
       
    67 
       
    68 
    39 
    69 //(4) Complete the functions overlap and similarity. The overlap of
    40 //(4) Complete the functions overlap and similarity. The overlap of
    70 //    two documents is calculated by the formula given in the assignment
    41 //    two documents is calculated by the formula given in the assignment
    71 //    description. The similarity of two strings is given by the overlap
    42 //    description. The similarity of two strings is given by the overlap
    72 //    of the cleaned strings (see (1)).  
    43 //    of the cleaned strings (see (1)).
    73 
    44 
    74 
    45 
    75 //def overlap(lst1: List[String], lst2: List[String]) : Double = ...
    46 //def overlap(lst1: List[String], lst2: List[String]) : Double = ...
    76 
    47 def overlap(lst1: List[String], lst2: List[String]) : Double =
       
    48     prod(lst1, lst2).toDouble/Math.max(prod(lst1, lst1).toDouble, prod(lst2, lst2).toDouble)
    77 //def similarity(s1: String, s2: String) : Double = ...
    49 //def similarity(s1: String, s2: String) : Double = ...
    78 
    50 def similarity(s1: String, s2: String) : Double =
    79 
    51     overlap(clean(s1), clean(s2))
    80 
    52 
    81 
    53 
    82 /* Test cases
    54 /* Test cases
    83 
    55 import CW7a._
    84 
    56 val list1 = List("a", "b", "b", "c", "d")
    85 val list1 = List("a", "b", "b", "c", "d") 
       
    86 val list2 = List("d", "b", "d", "b", "d")
    57 val list2 = List("d", "b", "d", "b", "d")
    87 
    58 occurrences(List("a", "b", "b", "c", "d"))
    88 occurrences(List("a", "b", "b", "c", "d"))   // Map(a -> 1, b -> 2, c -> 1, d -> 1)
    59 occurrences(List("d", "b", "d", "b", "d"))
    89 occurrences(List("d", "b", "d", "b", "d"))   // Map(d -> 3, b -> 2)
    60 prod(list1,list2) // 7
    90 
       
    91 prod(list1,list2) // 7 
       
    92 prod(list1,list1)
       
    93 prod(list2,list2)
       
    94 
       
    95 overlap(list1, list2)   // 0.5384615384615384
    61 overlap(list1, list2)   // 0.5384615384615384
    96 overlap(list2, list1)   // 0.5384615384615384
    62 overlap(list2, list1)   // 0.5384615384615384
    97 overlap(list1, list1)   // 1.0
    63 overlap(list1, list1)   // 1.0
    98 overlap(list2, list2)   // 1.0
    64 overlap(list2, list2)   // 1.0
    99 
    65 // Plagiarism examples from
   100 // Plagiarism examples from 
       
   101 // https://desales.libguides.com/avoidingplagiarism/examples
    66 // https://desales.libguides.com/avoidingplagiarism/examples
   102 
       
   103 val orig1 = """There is a strong market demand for eco-tourism in
    67 val orig1 = """There is a strong market demand for eco-tourism in
   104 Australia. Its rich and diverse natural heritage ensures Australia's
    68 Australia. Its rich and diverse natural heritage ensures Australia's
   105 capacity to attract international ecotourists and gives Australia a
    69 capacity to attract international ecotourists and gives Australia a
   106 comparative advantage in the highly competitive tourism industry."""
    70 comparative advantage in the highly competitive tourism industry."""
   107 
       
   108 val plag1 = """There is a high market demand for eco-tourism in
    71 val plag1 = """There is a high market demand for eco-tourism in
   109 Australia. Australia has a comparative advantage in the highly
    72 Australia. Australia has a comparative advantage in the highly
   110 competitive tourism industry due to its rich and varied natural
    73 competitive tourism industry due to its rich and varied natural
   111 heritage which ensures Australia's capacity to attract international
    74 heritage which ensures Australia's capacity to attract international
   112 ecotourists."""
    75 ecotourists."""
   113 
       
   114 similarity(orig1, plag1) // 0.8679245283018868
    76 similarity(orig1, plag1) // 0.8679245283018868
   115 
    77 // Plagiarism examples from
   116 
       
   117 // Plagiarism examples from 
       
   118 // https://www.utc.edu/library/help/tutorials/plagiarism/examples-of-plagiarism.php
    78 // https://www.utc.edu/library/help/tutorials/plagiarism/examples-of-plagiarism.php
   119 
       
   120 val orig2 = """No oil spill is entirely benign. Depending on timing and
    79 val orig2 = """No oil spill is entirely benign. Depending on timing and
   121 location, even a relatively minor spill can cause significant harm to
    80 location, even a relatively minor spill can cause significant harm to
   122 individual organisms and entire populations. Oil spills can cause
    81 individual organisms and entire populations. Oil spills can cause
   123 impacts over a range of time scales, from days to years, or even
    82 impacts over a range of time scales, from days to years, or even
   124 decades for certain spills. Impacts are typically divided into acute
    83 decades for certain spills. Impacts are typically divided into acute
   125 (short-term) and chronic (long-term) effects. Both types are part of a
    84 (short-term) and chronic (long-term) effects. Both types are part of a
   126 complicated and often controversial equation that is addressed after
    85 complicated and often controversial equation that is addressed after
   127 an oil spill: ecosystem recovery."""
    86 an oil spill: ecosystem recovery."""
   128 
       
   129 val plag2 = """There is no such thing as a "good" oil spill. If the
    87 val plag2 = """There is no such thing as a "good" oil spill. If the
   130 time and place are just right, even a small oil spill can cause damage
    88 time and place are just right, even a small oil spill can cause damage
   131 to sensitive ecosystems. Further, spills can cause harm days, months,
    89 to sensitive ecosystems. Further, spills can cause harm days, months,
   132 years, or even decades after they occur. Because of this, spills are
    90 years, or even decades after they occur. Because of this, spills are
   133 usually broken into short-term (acute) and long-term (chronic)
    91 usually broken into short-term (acute) and long-term (chronic)
   134 effects. Both of these types of harm must be addressed in ecosystem
    92 effects. Both of these types of harm must be addressed in ecosystem
   135 recovery: a controversial tactic that is often implemented immediately
    93 recovery: a controversial tactic that is often implemented immediately
   136 following an oil spill."""
    94 following an oil spill."""
   137 
       
   138 overlap(clean(orig2), clean(plag2))  // 0.728
    95 overlap(clean(orig2), clean(plag2))  // 0.728
   139 similarity(orig2, plag2)             // 0.728
    96 similarity(orig2, plag2)             // 0.728
   140 
    97 // The punchline: everything above 0.6 looks suspicious and
   141 
       
   142  
       
   143 // The punchline: everything above 0.6 looks suspicious and 
       
   144 // should be investigated by staff.
    98 // should be investigated by staff.
   145 
       
   146 */
    99 */
   147 
   100 
   148 }
   101 }
   149