testing2/docdiff.scala
changeset 323 1f8005b4cdf6
parent 320 cdfb2ce30a3d
--- a/testing2/docdiff.scala	Tue Nov 19 06:38:20 2019 +0000
+++ b/testing2/docdiff.scala	Fri Nov 22 16:41:45 2019 +0000
@@ -1,122 +1,81 @@
 // Preliminary Part about Code Similarity
 //========================================
 
-
-object CW7a { 
+object CW7a {
 
 
 //(1) Complete the clean function below. It should find
 //    all words in a string using the regular expression
-//    \w+  and the library function 
+//    \w+  and the library function
 //
 //         some_regex.findAllIn(some_string)
 //
 //    The words should be Returned as a list of strings.
 
 
-def clean(s: String) : List[String] = {
-    val regex = """\w+""".r;
-    val list_of_words = s.split(" ").toList
-    for(word <- list_of_words;
-        actual_word <- divide_string_where_different(word, regex.findAllIn(word).mkString, 0)) yield actual_word
-}
+// Returns, in order of appearance, every maximal run of word characters (regex \w+) found in s.
+def clean(s: String) : List[String] =
+    "\\w+".r.findAllMatchIn(s).map(_.matched).toList
 
-/*
-    A secondary function that takes as parameters @param original which is the original word, @param returned which is thea word after the process of removing 
-    some characters not allowed by a regular expression, and @param i which is the index where to start compare the characters of the two words.
-    It @return a List of strings which represents all the substrings of returned which were previously divided by characters not allowed by the regular expression applied on it.
-*/
-def divide_string_where_different(original: String, returned: String, i : Int): List[String] ={
-    val max_i = original.length -1
-    if(original(i) != returned(i)) returned.substring(0, i)::divide_string_where_different(original.substring(i+1), returned.substring(i), 0).filter(_.nonEmpty)
-    else if (i == max_i) List(returned)
-    else divide_string_where_different(original,returned, i +1)
-    
-}
-
-//(2) The function occurrences calculates the number of times  
-//    strings occur in a list of strings. These occurrences should 
+//(2) The function occurrences calculates the number of times
+//    strings occur in a list of strings. These occurrences should
 //    be calculated as a Map from strings to integers.
 
 
-def occurrences(xs: List[String]): Map[String, Int] = {
-    val lst = xs.distinct
-    val word_pairs = (for (word <- lst) yield (word, xs.count(_==word))).toList
-    word_pairs.toMap
-}
-
-
+// Maps each distinct string of xs to the number of times it occurs in xs.
+def occurrences(xs: List[String]) : Map[String, Int] =
+    xs.groupMapReduce(identity)(_ => 1)(_ + _)
 
 //(3) This functions calculates the dot-product of two documents
 //    (list of strings). For this it calculates the occurrence
-//    maps from (2) and then multiplies the corresponding occurrences. 
+//    maps from (2) and then multiplies the corresponding occurrences.
 //    If a string does not occur in a document, the product is zero.
-//    The function finally sums up all products. 
+//    The function finally sums up all products.
 
 
-def prod(lst1: List[String], lst2: List[String]) : Int = {
-    val map1 = occurrences(lst1)
-    val map2 = occurrences(lst2)
-    print(s"map1 is $map1 \n and map2 is $map2")
-    val pairs = (for(pair1 <- map1 if(map2.get(pair1._1) != None)) yield (pair1._2, map2.get(pair1._1).get)).toList
-    print(s"\n pairs are $pairs")
-    val products = (for(pair <- pairs) yield pair._1 * pair._2).toList
-    products.sum
-
-}
-
+// Dot-product of both occurrence maps; a word missing from lst2 contributes 0, and empty input gives 0.
+def prod(lst1: List[String], lst2: List[String]) : Int =
+    occurrences(lst1).iterator.map { case (word, n) => n * occurrences(lst2).getOrElse(word, 0) }.sum
 
 //(4) Complete the functions overlap and similarity. The overlap of
 //    two documents is calculated by the formula given in the assignment
 //    description. The similarity of two strings is given by the overlap
-//    of the cleaned strings (see (1)).  
+//    of the cleaned strings (see (1)).
 
 
 //def overlap(lst1: List[String], lst2: List[String]) : Double = ...
-
+def overlap(lst1: List[String], lst2: List[String]) : Double = // prod of both / larger self-product
+    prod(lst1, lst2).toDouble / (prod(lst1, lst1) max prod(lst2, lst2)).toDouble
 //def similarity(s1: String, s2: String) : Double = ...
-
-
+def similarity(s1: String, s2: String) : Double = // overlap of the word lists that clean extracts
+    overlap(clean(s1), clean(s2))
 
 
 /* Test cases
-
-
-val list1 = List("a", "b", "b", "c", "d") 
+import CW7a._
+val list1 = List("a", "b", "b", "c", "d")
 val list2 = List("d", "b", "d", "b", "d")
-
-occurrences(List("a", "b", "b", "c", "d"))   // Map(a -> 1, b -> 2, c -> 1, d -> 1)
-occurrences(List("d", "b", "d", "b", "d"))   // Map(d -> 3, b -> 2)
-
-prod(list1,list2) // 7 
-prod(list1,list1)
-prod(list2,list2)
-
+occurrences(List("a", "b", "b", "c", "d"))
+occurrences(List("d", "b", "d", "b", "d"))
+prod(list1,list2) // 7
 overlap(list1, list2)   // 0.5384615384615384
 overlap(list2, list1)   // 0.5384615384615384
 overlap(list1, list1)   // 1.0
 overlap(list2, list2)   // 1.0
-
-// Plagiarism examples from 
+// Plagiarism examples from
 // https://desales.libguides.com/avoidingplagiarism/examples
-
 val orig1 = """There is a strong market demand for eco-tourism in
 Australia. Its rich and diverse natural heritage ensures Australia's
 capacity to attract international ecotourists and gives Australia a
 comparative advantage in the highly competitive tourism industry."""
-
 val plag1 = """There is a high market demand for eco-tourism in
 Australia. Australia has a comparative advantage in the highly
 competitive tourism industry due to its rich and varied natural
 heritage which ensures Australia's capacity to attract international
 ecotourists."""
-
 similarity(orig1, plag1) // 0.8679245283018868
-
-
-// Plagiarism examples from 
+// Plagiarism examples from
 // https://www.utc.edu/library/help/tutorials/plagiarism/examples-of-plagiarism.php
-
 val orig2 = """No oil spill is entirely benign. Depending on timing and
 location, even a relatively minor spill can cause significant harm to
 individual organisms and entire populations. Oil spills can cause
@@ -125,7 +84,6 @@
 (short-term) and chronic (long-term) effects. Both types are part of a
 complicated and often controversial equation that is addressed after
 an oil spill: ecosystem recovery."""
-
 val plag2 = """There is no such thing as a "good" oil spill. If the
 time and place are just right, even a small oil spill can cause damage
 to sensitive ecosystems. Further, spills can cause harm days, months,
@@ -134,16 +92,10 @@
 effects. Both of these types of harm must be addressed in ecosystem
 recovery: a controversial tactic that is often implemented immediately
 following an oil spill."""
-
 overlap(clean(orig2), clean(plag2))  // 0.728
 similarity(orig2, plag2)             // 0.728
-
-
- 
-// The punchline: everything above 0.6 looks suspicious and 
+// The punchline: everything above 0.6 looks suspicious and
 // should be investigated by staff.
-
 */
 
 }
-