testing2/docdiff.scala
changeset 320 cdfb2ce30a3d
parent 283 ef5f62bf5987
child 323 1f8005b4cdf6
equal deleted inserted replaced
319:b84ea52bfd8f 320:cdfb2ce30a3d
     1 // Preliminary Part about Code Similarity
     1 // Preliminary Part about Code Similarity
     2 //========================================
     2 //========================================
     3 
     3 
     4 
     4 
     5 object CW7a { // for purposes of generating a jar
     5 object CW7a { 
       
     6 
     6 
     7 
     7 //(1) Complete the clean function below. It should find
     8 //(1) Complete the clean function below. It should find
     8 //    all words in a string using the regular expression
     9 //    all words in a string using the regular expression
     9 //    \w+  and the library function 
    10 //    \w+  and the library function 
    10 //
    11 //
    11 //         some_regex.findAllIn(some_string)
    12 //         some_regex.findAllIn(some_string)
    12 //
    13 //
    13 //    The words should be Returned as a list of strings.
    14 //    The words should be Returned as a list of strings.
    14 
    15 
    15 def clean(s: String) : List[String] = 
       
    16   ("""\w+""".r).findAllIn(s).toList
       
    17 
    16 
       
    17 def clean(s: String) : List[String] = {
       
    18     val regex = """\w+""".r;
       
    19     val list_of_words = s.split(" ").toList
       
    20     for(word <- list_of_words;
       
    21         actual_word <- divide_string_where_different(word, regex.findAllIn(word).mkString, 0)) yield actual_word
       
    22 }
       
    23 
       
    24 /*
       
    25     A secondary function that takes as parameters @param original which is the original word, @param returned which is thea word after the process of removing 
       
    26     some characters not allowed by a regular expression, and @param i which is the index where to start compare the characters of the two words.
       
    27     It @return a List of strings which represents all the substrings of returned which were previously divided by characters not allowed by the regular expression applied on it.
       
    28 */
       
    29 def divide_string_where_different(original: String, returned: String, i : Int): List[String] ={
       
    30     val max_i = original.length -1
       
    31     if(original(i) != returned(i)) returned.substring(0, i)::divide_string_where_different(original.substring(i+1), returned.substring(i), 0).filter(_.nonEmpty)
       
    32     else if (i == max_i) List(returned)
       
    33     else divide_string_where_different(original,returned, i +1)
       
    34     
       
    35 }
    18 
    36 
    19 //(2) The function occurrences calculates the number of times  
    37 //(2) The function occurrences calculates the number of times  
    20 //    strings occur in a list of strings. These occurrences should 
    38 //    strings occur in a list of strings. These occurrences should 
    21 //    be calculated as a Map from strings to integers.
    39 //    be calculated as a Map from strings to integers.
    22 
    40 
    23 def occurrences(xs: List[String]): Map[String, Int] =
    41 
    24   (for (x <- xs.distinct) yield (x, xs.count(_ == x))).toMap
    42 def occurrences(xs: List[String]): Map[String, Int] = {
       
    43     val lst = xs.distinct
       
    44     val word_pairs = (for (word <- lst) yield (word, xs.count(_==word))).toList
       
    45     word_pairs.toMap
       
    46 }
       
    47 
       
    48 
    25 
    49 
    26 //(3) This functions calculates the dot-product of two documents
    50 //(3) This functions calculates the dot-product of two documents
    27 //    (list of strings). For this it calculates the occurrence
    51 //    (list of strings). For this it calculates the occurrence
    28 //    maps from (2) and then multiplies the corresponding occurrences. 
    52 //    maps from (2) and then multiplies the corresponding occurrences. 
    29 //    If a string does not occur in a document, the product is zero.
    53 //    If a string does not occur in a document, the product is zero.
    30 //    The function finally sums up all products. 
    54 //    The function finally sums up all products. 
    31 
    55 
       
    56 
    32 def prod(lst1: List[String], lst2: List[String]) : Int = {
    57 def prod(lst1: List[String], lst2: List[String]) : Int = {
    33     val words = (lst1 ::: lst2).distinct
    58     val map1 = occurrences(lst1)
    34     val occs1 = occurrences(lst1)
    59     val map2 = occurrences(lst2)
    35     val occs2 = occurrences(lst2)
    60     print(s"map1 is $map1 \n and map2 is $map2")
    36     words.map{ w => occs1.getOrElse(w, 0) * occs2.getOrElse(w, 0) }.sum
    61     val pairs = (for(pair1 <- map1 if(map2.get(pair1._1) != None)) yield (pair1._2, map2.get(pair1._1).get)).toList
       
    62     print(s"\n pairs are $pairs")
       
    63     val products = (for(pair <- pairs) yield pair._1 * pair._2).toList
       
    64     products.sum
       
    65 
    37 }
    66 }
       
    67 
    38 
    68 
    39 //(4) Complete the functions overlap and similarity. The overlap of
    69 //(4) Complete the functions overlap and similarity. The overlap of
    40 //    two documents is calculated by the formula given in the assignment
    70 //    two documents is calculated by the formula given in the assignment
    41 //    description. The similarity of two strings is given by the overlap
    71 //    description. The similarity of two strings is given by the overlap
    42 //    of the cleaned (see (1)) strings.  
    72 //    of the cleaned strings (see (1)).  
    43 
       
    44 def overlap(lst1: List[String], lst2: List[String]) : Double = {
       
    45     val m1 = prod(lst1, lst1)
       
    46     val m2 = prod(lst2, lst2) 
       
    47     prod(lst1, lst2).toDouble / (List(m1, m2).max)
       
    48 }
       
    49 
       
    50 def similarity(s1: String, s2: String) : Double =
       
    51   overlap(clean(s1), clean(s2))
       
    52 
    73 
    53 
    74 
    54 /*
    75 //def overlap(lst1: List[String], lst2: List[String]) : Double = ...
       
    76 
       
    77 //def similarity(s1: String, s2: String) : Double = ...
       
    78 
       
    79 
       
    80 
       
    81 
       
    82 /* Test cases
    55 
    83 
    56 
    84 
    57 val list1 = List("a", "b", "b", "c", "d") 
    85 val list1 = List("a", "b", "b", "c", "d") 
    58 val list2 = List("d", "b", "d", "b", "d")
    86 val list2 = List("d", "b", "d", "b", "d")
    59 
    87 
    60 occurrences(List("a", "b", "b", "c", "d"))   // Map(a -> 1, b -> 2, c -> 1, d -> 1)
    88 occurrences(List("a", "b", "b", "c", "d"))   // Map(a -> 1, b -> 2, c -> 1, d -> 1)
    61 occurrences(List("d", "b", "d", "b", "d"))   // Map(d -> 3, b -> 2)
    89 occurrences(List("d", "b", "d", "b", "d"))   // Map(d -> 3, b -> 2)
    62 
    90 
    63 prod(list1,list2) // 7 
    91 prod(list1,list2) // 7 
       
    92 prod(list1,list1)
       
    93 prod(list2,list2)
    64 
    94 
    65 overlap(list1, list2)   // 0.5384615384615384
    95 overlap(list1, list2)   // 0.5384615384615384
    66 overlap(list2, list1)   // 0.5384615384615384
    96 overlap(list2, list1)   // 0.5384615384615384
    67 overlap(list1, list1)   // 1.0
    97 overlap(list1, list1)   // 1.0
    68 overlap(list2, list2)   // 1.0
    98 overlap(list2, list2)   // 1.0
    79 Australia. Australia has a comparative advantage in the highly
   109 Australia. Australia has a comparative advantage in the highly
    80 competitive tourism industry due to its rich and varied natural
   110 competitive tourism industry due to its rich and varied natural
    81 heritage which ensures Australia's capacity to attract international
   111 heritage which ensures Australia's capacity to attract international
    82 ecotourists."""
   112 ecotourists."""
    83 
   113 
    84 similarity(orig1, plag1)
   114 similarity(orig1, plag1) // 0.8679245283018868
    85 
   115 
    86 
   116 
    87 // Plagiarism examples from 
   117 // Plagiarism examples from 
    88 // https://www.utc.edu/library/help/tutorials/plagiarism/examples-of-plagiarism.php
   118 // https://www.utc.edu/library/help/tutorials/plagiarism/examples-of-plagiarism.php
    89 
   119 
   103 usually broken into short-term (acute) and long-term (chronic)
   133 usually broken into short-term (acute) and long-term (chronic)
   104 effects. Both of these types of harm must be addressed in ecosystem
   134 effects. Both of these types of harm must be addressed in ecosystem
   105 recovery: a controversial tactic that is often implemented immediately
   135 recovery: a controversial tactic that is often implemented immediately
   106 following an oil spill."""
   136 following an oil spill."""
   107 
   137 
   108 overlap(clean(orig2), clean(plag2))
   138 overlap(clean(orig2), clean(plag2))  // 0.728
   109 similarity(orig2, plag2)
   139 similarity(orig2, plag2)             // 0.728
   110 
   140 
       
   141 
       
   142  
   111 // The punchline: everything above 0.6 looks suspicious and 
   143 // The punchline: everything above 0.6 looks suspicious and 
   112 // should be looked at by staff.
   144 // should be investigated by staff.
   113 
   145 
   114 */
   146 */
   115 
   147 
       
   148 }
   116 
   149 
   117 }