pep-material: comparison testing2/docdiff.scala

equal deleted inserted replaced

-:755d165633ec
+:1f8005b4cdf6
 // Preliminary Part about Code Similarity
 //========================================
+object CW7a {
-object CW7a {
 //(1) Complete the clean function below. It should find
 //    all words in a string using the regular expression
 //    \w+  and the library function
 //
 //         some_regex.findAllIn(some_string)
 //
 //    The words should be returned as a list of strings.
-def clean(s: String) : List[String] = {
+//def clean(s: String) : List[String] = ...
-val regex = """\w+""".r;
+def clean(s: String) : List[String] =
-val list_of_words = s.split(" ").toList
+"\\w+".r.findAllIn(s).toList
-for(word <- list_of_words;
-actual_word <- divide_string_where_different(word, regex.findAllIn(word).mkString, 0)) yield actual_word
-}
-/*
+//(2) The function occurrences calculates the number of times
-A secondary function that takes as parameters @param original which is the original word, @param returned which is the word after the process of removing
+//    strings occur in a list of strings. These occurrences should
-some characters not allowed by a regular expression, and @param i which is the index where to start comparing the characters of the two words.
-It @return a List of strings which represents all the substrings of returned which were previously divided by characters not allowed by the regular expression applied on it.
-*/
-def divide_string_where_different(original: String, returned: String, i : Int): List[String] ={
-val max_i = original.length -1
-if(original(i) != returned(i)) returned.substring(0, i)::divide_string_where_different(original.substring(i+1), returned.substring(i), 0).filter(_.nonEmpty)
-else if (i == max_i) List(returned)
-else divide_string_where_different(original,returned, i +1)
-}
-//(2) The function occurrences calculates the number of times
-//    strings occur in a list of strings. These occurrences should
 //    be calculated as a Map from strings to integers.
-def occurrences(xs: List[String]): Map[String, Int] = {
+//def occurrences(xs: List[String]): Map[String, Int] = ..
-val lst = xs.distinct
+def occurrences(xs: List[String]) : Map[String, Int] =
-val word_pairs = (for (word <- lst) yield (word, xs.count(_==word))).toList
+xs.groupBy(identity).view.mapValues(_.size).toMap
-word_pairs.toMap
-}
 //(3) This function calculates the dot-product of two documents
 //    (list of strings). For this it calculates the occurrence
 //    maps from (2) and then multiplies the corresponding occurrences.
 //    If a string does not occur in a document, the product is zero.
 //    The function finally sums up all products.
-def prod(lst1: List[String], lst2: List[String]) : Int = {
+//def prod(lst1: List[String], lst2: List[String]) : Int = ..
-val map1 = occurrences(lst1)
+def prod(lst1: List[String], lst2: List[String]) : Int =
-val map2 = occurrences(lst2)
+occurrences(lst1).map(x => occurrences(lst2).getOrElse(x._1, 0) * x._2).reduce(_ + _)
-print(s"map1 is $map1 \n and map2 is $map2")
-val pairs = (for(pair1 <- map1 if(map2.get(pair1._1) != None)) yield (pair1._2, map2.get(pair1._1).get)).toList
-print(s"\n pairs are $pairs")
-val products = (for(pair <- pairs) yield pair._1 * pair._2).toList
-products.sum
-}
 //(4) Complete the functions overlap and similarity. The overlap of
 //    two documents is calculated by the formula given in the assignment
 //    description. The similarity of two strings is given by the overlap
 //    of the cleaned strings (see (1)).
 //def overlap(lst1: List[String], lst2: List[String]) : Double = ...
+def overlap(lst1: List[String], lst2: List[String]) : Double =
+prod(lst1, lst2).toDouble/Math.max(prod(lst1, lst1).toDouble, prod(lst2, lst2).toDouble)
 //def similarity(s1: String, s2: String) : Double = ...
+def similarity(s1: String, s2: String) : Double =
+overlap(clean(s1), clean(s2))
 /* Test cases
+import CW7a._
+val list1 = List("a", "b", "b", "c", "d")
-val list1 = List("a", "b", "b", "c", "d")
 val list2 = List("d", "b", "d", "b", "d")
+occurrences(List("a", "b", "b", "c", "d"))
-occurrences(List("a", "b", "b", "c", "d"))   // Map(a -> 1, b -> 2, c -> 1, d -> 1)
+occurrences(List("d", "b", "d", "b", "d"))
-occurrences(List("d", "b", "d", "b", "d"))   // Map(d -> 3, b -> 2)
+prod(list1,list2) // 7
-prod(list1,list2) // 7
-prod(list1,list1)
-prod(list2,list2)
 overlap(list1, list2)   // 0.5384615384615384
 overlap(list2, list1)   // 0.5384615384615384
 overlap(list1, list1)   // 1.0
 overlap(list2, list2)   // 1.0
+// Plagiarism examples from
-// Plagiarism examples from
 // https://desales.libguides.com/avoidingplagiarism/examples
 val orig1 = """There is a strong market demand for eco-tourism in
 Australia. Its rich and diverse natural heritage ensures Australia's
 capacity to attract international ecotourists and gives Australia a
 comparative advantage in the highly competitive tourism industry."""
 val plag1 = """There is a high market demand for eco-tourism in
 Australia. Australia has a comparative advantage in the highly
 competitive tourism industry due to its rich and varied natural
 heritage which ensures Australia's capacity to attract international
 ecotourists."""
 similarity(orig1, plag1) // 0.8679245283018868
+// Plagiarism examples from
-// Plagiarism examples from
 // https://www.utc.edu/library/help/tutorials/plagiarism/examples-of-plagiarism.php
 val orig2 = """No oil spill is entirely benign. Depending on timing and
 location, even a relatively minor spill can cause significant harm to
 individual organisms and entire populations. Oil spills can cause
 impacts over a range of time scales, from days to years, or even
 decades for certain spills. Impacts are typically divided into acute
 (short-term) and chronic (long-term) effects. Both types are part of a
 complicated and often controversial equation that is addressed after
 an oil spill: ecosystem recovery."""
 val plag2 = """There is no such thing as a "good" oil spill. If the
 time and place are just right, even a small oil spill can cause damage
 to sensitive ecosystems. Further, spills can cause harm days, months,
 years, or even decades after they occur. Because of this, spills are
 usually broken into short-term (acute) and long-term (chronic)
 effects. Both of these types of harm must be addressed in ecosystem
 recovery: a controversial tactic that is often implemented immediately
 following an oil spill."""
 overlap(clean(orig2), clean(plag2))  // 0.728
 similarity(orig2, plag2)             // 0.728
+// The punchline: everything above 0.6 looks suspicious and
-// The punchline: everything above 0.6 looks suspicious and
 // should be investigated by staff.
 */
 }

changeset 323	1f8005b4cdf6
parent 320	cdfb2ce30a3d