// Core Part 2 about Code Similarity
//===================================

object C2 {

  // Matches maximal runs of word characters (regex class \w).
  // Compiled once here instead of on every call to clean.
  private val WordPattern = """\w+""".r

  /** (1) Splits a string into its words.
    *
    * A word is a maximal match of the regular expression `\w+`; everything
    * else (whitespace, punctuation, ...) acts as a separator and is dropped.
    *
    * @param s the text to tokenise
    * @return the words of `s`, in order of appearance (duplicates are kept)
    */
  def clean(s: String): List[String] =
    WordPattern.findAllIn(s).toList

  /** (2) Counts how many times each string occurs in a list.
    *
    * @param xs the list of strings (a "document")
    * @return a map from every distinct string in `xs` to its number of
    *         occurrences; strings that do not occur have no entry
    */
  def occurrences(xs: List[String]): Map[String, Int] =
    // Single grouping pass instead of a count over the whole list per word.
    xs.groupBy(identity).map { case (word, hits) => word -> hits.size }

  /** (3) Dot-product of two documents.
    *
    * Multiplies, for every word, its number of occurrences in `lst1` with its
    * number of occurrences in `lst2` and sums up the products. A word that is
    * missing from one of the documents contributes zero.
    *
    * @param lst1 first document
    * @param lst2 second document
    * @return the sum of the pairwise occurrence products
    */
  def prod(lst1: List[String], lst2: List[String]): Int = {
    val occs1 = occurrences(lst1)
    val occs2 = occurrences(lst2)
    // Iterating over the words of lst1 is enough: a word that only occurs in
    // lst2 would be multiplied by zero anyway.
    occs1.iterator.map { case (word, n) => n * occs2.getOrElse(word, 0) }.sum
  }

  /** (4) Overlap of two documents.
    *
    * Computed as `prod(lst1, lst2) / max(prod(lst1, lst1), prod(lst2, lst2))`.
    *
    * @param lst1 first document
    * @param lst2 second document
    * @return the overlap as a Double; `NaN` if both documents are empty
    *         (the denominator is then zero)
    */
  def overlap(lst1: List[String], lst2: List[String]): Double = {
    val denominator = prod(lst1, lst1).max(prod(lst2, lst2))
    // The numerator must be widened BEFORE dividing: Int / Int truncates the
    // quotient to 0 or 1 (and throws ArithmeticException for 0 / 0).
    prod(lst1, lst2).toDouble / denominator
  }

  /** Similarity of two texts: the overlap of their cleaned word lists.
    *
    * @param s1 first text
    * @param s2 second text
    * @return `overlap(clean(s1), clean(s2))`
    */
  def similarity(s1: String, s2: String): Double =
    overlap(clean(s1), clean(s2))


/* Test cases

val list1 = List("a", "b", "b", "c", "d")

[... lines not visible in this view ...]

heritage which ensures Australia's capacity to attract international
ecotourists."""

similarity(orig1, plag1)      // 0.8679245283018868


// Plagiarism examples from

[... lines not visible in this view ...]

recovery: a controversial tactic that is often implemented immediately
following an oil spill."""

overlap(clean(orig2), clean(plag2))     // 0.728
similarity(orig2, plag2)                // 0.728


// The punchline: everything above 0.6 looks suspicious and
// should be investigated by staff.

*/

}