diff -r 755d165633ec -r 1f8005b4cdf6 testing2/docdiff.scala --- a/testing2/docdiff.scala Tue Nov 19 06:38:20 2019 +0000 +++ b/testing2/docdiff.scala Fri Nov 22 16:41:45 2019 +0000 @@ -1,122 +1,81 @@ // Preliminary Part about Code Similarity //======================================== - -object CW7a { +object CW7a { //(1) Complete the clean function below. It should find // all words in a string using the regular expression -// \w+ and the library function +// \w+ and the library function // // some_regex.findAllIn(some_string) // // The words should be Returned as a list of strings. -def clean(s: String) : List[String] = { - val regex = """\w+""".r; - val list_of_words = s.split(" ").toList - for(word <- list_of_words; - actual_word <- divide_string_where_different(word, regex.findAllIn(word).mkString, 0)) yield actual_word -} +//def clean(s: String) : List[String] = ... +def clean(s: String) : List[String] = + "\\w+".r.findAllIn(s).toList -/* - A secondary function that takes as parameters @param original which is the original word, @param returned which is thea word after the process of removing - some characters not allowed by a regular expression, and @param i which is the index where to start compare the characters of the two words. - It @return a List of strings which represents all the substrings of returned which were previously divided by characters not allowed by the regular expression applied on it. -*/ -def divide_string_where_different(original: String, returned: String, i : Int): List[String] ={ - val max_i = original.length -1 - if(original(i) != returned(i)) returned.substring(0, i)::divide_string_where_different(original.substring(i+1), returned.substring(i), 0).filter(_.nonEmpty) - else if (i == max_i) List(returned) - else divide_string_where_different(original,returned, i +1) - -} - -//(2) The function occurrences calculates the number of times -// strings occur in a list of strings. These occurrences should +//(2) The function occurrences calculates the number of times +// strings occur in a list of strings. These occurrences should // be calculated as a Map from strings to integers. -def occurrences(xs: List[String]): Map[String, Int] = { - val lst = xs.distinct - val word_pairs = (for (word <- lst) yield (word, xs.count(_==word))).toList - word_pairs.toMap -} - - +//def occurrences(xs: List[String]): Map[String, Int] = .. +def occurrences(xs: List[String]) : Map[String, Int] = + xs.groupBy(identity).view.mapValues(_.size).toMap //(3) This functions calculates the dot-product of two documents // (list of strings). For this it calculates the occurrence -// maps from (2) and then multiplies the corresponding occurrences. +// maps from (2) and then multiplies the corresponding occurrences. // If a string does not occur in a document, the product is zero. -// The function finally sums up all products. +// The function finally sums up all products. -def prod(lst1: List[String], lst2: List[String]) : Int = { - val map1 = occurrences(lst1) - val map2 = occurrences(lst2) - print(s"map1 is $map1 \n and map2 is $map2") - val pairs = (for(pair1 <- map1 if(map2.get(pair1._1) != None)) yield (pair1._2, map2.get(pair1._1).get)).toList - print(s"\n pairs are $pairs") - val products = (for(pair <- pairs) yield pair._1 * pair._2).toList - products.sum - -} - +//def prod(lst1: List[String], lst2: List[String]) : Int = .. +def prod(lst1: List[String], lst2: List[String]) : Int = + occurrences(lst1).map(x => occurrences(lst2).getOrElse(x._1, 0) * x._2).reduce(_ + _) //(4) Complete the functions overlap and similarity. The overlap of // two documents is calculated by the formula given in the assignment // description. The similarity of two strings is given by the overlap -// of the cleaned strings (see (1)). +// of the cleaned strings (see (1)). //def overlap(lst1: List[String], lst2: List[String]) : Double = ... - +def overlap(lst1: List[String], lst2: List[String]) : Double = + prod(lst1, lst2).toDouble/Math.max(prod(lst1, lst1).toDouble, prod(lst2, lst2).toDouble) //def similarity(s1: String, s2: String) : Double = ... - - +def similarity(s1: String, s2: String) : Double = + overlap(clean(s1), clean(s2)) /* Test cases - - -val list1 = List("a", "b", "b", "c", "d") +import CW7a._ +val list1 = List("a", "b", "b", "c", "d") val list2 = List("d", "b", "d", "b", "d") - -occurrences(List("a", "b", "b", "c", "d")) // Map(a -> 1, b -> 2, c -> 1, d -> 1) -occurrences(List("d", "b", "d", "b", "d")) // Map(d -> 3, b -> 2) - -prod(list1,list2) // 7 -prod(list1,list1) -prod(list2,list2) - +occurrences(List("a", "b", "b", "c", "d")) +occurrences(List("d", "b", "d", "b", "d")) +prod(list1,list2) // 7 overlap(list1, list2) // 0.5384615384615384 overlap(list2, list1) // 0.5384615384615384 overlap(list1, list1) // 1.0 overlap(list2, list2) // 1.0 - -// Plagiarism examples from +// Plagiarism examples from // https://desales.libguides.com/avoidingplagiarism/examples - val orig1 = """There is a strong market demand for eco-tourism in Australia. Its rich and diverse natural heritage ensures Australia's capacity to attract international ecotourists and gives Australia a comparative advantage in the highly competitive tourism industry.""" - val plag1 = """There is a high market demand for eco-tourism in Australia. Australia has a comparative advantage in the highly competitive tourism industry due to its rich and varied natural heritage which ensures Australia's capacity to attract international ecotourists.""" - similarity(orig1, plag1) // 0.8679245283018868 - - -// Plagiarism examples from +// Plagiarism examples from // https://www.utc.edu/library/help/tutorials/plagiarism/examples-of-plagiarism.php - val orig2 = """No oil spill is entirely benign. Depending on timing and location, even a relatively minor spill can cause significant harm to individual organisms and entire populations. Oil spills can cause @@ -125,7 +84,6 @@ (short-term) and chronic (long-term) effects. Both types are part of a complicated and often controversial equation that is addressed after an oil spill: ecosystem recovery.""" - val plag2 = """There is no such thing as a "good" oil spill. If the time and place are just right, even a small oil spill can cause damage to sensitive ecosystems. Further, spills can cause harm days, months, @@ -134,16 +92,10 @@ effects. Both of these types of harm must be addressed in ecosystem recovery: a controversial tactic that is often implemented immediately following an oil spill.""" - overlap(clean(orig2), clean(plag2)) // 0.728 similarity(orig2, plag2) // 0.728 - - - -// The punchline: everything above 0.6 looks suspicious and +// The punchline: everything above 0.6 looks suspicious and // should be investigated by staff. - */ } -