1 // Preliminary Part about Code Similarity  | 
     1 // Preliminary Part about Code Similarity  | 
     2 //========================================  | 
     2 //========================================  | 
     3   | 
     3   | 
     4   | 
     4   | 
     5 object CW7a { // for purposes of generating a jar | 
     5 object CW7a {  | 
         | 
     6   | 
     6   | 
     7   | 
     7 //(1) Complete the clean function below. It should find  | 
     8 //(1) Complete the clean function below. It should find  | 
     8 //    all words in a string using the regular expression  | 
     9 //    all words in a string using the regular expression  | 
     9 //    \w+  and the library function   | 
    10 //    \w+  and the library function   | 
    10 //  | 
    11 //  | 
    11 //         some_regex.findAllIn(some_string)  | 
    12 //         some_regex.findAllIn(some_string)  | 
    12 //  | 
    13 //  | 
    13 //    The words should be Returned as a list of strings.  | 
    14 //    The words should be Returned as a list of strings.  | 
    14   | 
    15   | 
    15 def clean(s: String) : List[String] =   | 
         | 
    16   ("""\w+""".r).findAllIn(s).toList | 
         | 
    17   | 
    16   | 
         | 
    17 def clean(s: String) : List[String] = { | 
         | 
    18     val regex = """\w+""".r;  | 
         | 
    19     val list_of_words = s.split(" ").toList | 
         | 
    20     for(word <- list_of_words;  | 
         | 
    21         actual_word <- divide_string_where_different(word, regex.findAllIn(word).mkString, 0)) yield actual_word  | 
         | 
    22 }  | 
         | 
    23   | 
         | 
    24 /*  | 
         | 
    25     A secondary function that takes as parameters @param original which is the original word, @param returned which is thea word after the process of removing   | 
         | 
    26     some characters not allowed by a regular expression, and @param i which is the index where to start compare the characters of the two words.  | 
         | 
    27     It @return a List of strings which represents all the substrings of returned which were previously divided by characters not allowed by the regular expression applied on it.  | 
         | 
    28 */  | 
         | 
    29 def divide_string_where_different(original: String, returned: String, i : Int): List[String] ={ | 
         | 
    30     val max_i = original.length -1  | 
         | 
    31     if(original(i) != returned(i)) returned.substring(0, i)::divide_string_where_different(original.substring(i+1), returned.substring(i), 0).filter(_.nonEmpty)  | 
         | 
    32     else if (i == max_i) List(returned)  | 
         | 
    33     else divide_string_where_different(original,returned, i +1)  | 
         | 
    34       | 
         | 
    35 }  | 
    18   | 
    36   | 
    19 //(2) The function occurrences calculates the number of times    | 
    37 //(2) The function occurrences calculates the number of times    | 
    20 //    strings occur in a list of strings. These occurrences should   | 
    38 //    strings occur in a list of strings. These occurrences should   | 
    21 //    be calculated as a Map from strings to integers.  | 
    39 //    be calculated as a Map from strings to integers.  | 
    22   | 
    40   | 
    23 def occurrences(xs: List[String]): Map[String, Int] =  | 
    41   | 
    24   (for (x <- xs.distinct) yield (x, xs.count(_ == x))).toMap  | 
    42 def occurrences(xs: List[String]): Map[String, Int] = { | 
         | 
    43     val lst = xs.distinct  | 
         | 
    44     val word_pairs = (for (word <- lst) yield (word, xs.count(_==word))).toList  | 
         | 
    45     word_pairs.toMap  | 
         | 
    46 }  | 
         | 
    47   | 
         | 
    48   | 
    25   | 
    49   | 
    26 //(3) This functions calculates the dot-product of two documents  | 
    50 //(3) This functions calculates the dot-product of two documents  | 
    27 //    (list of strings). For this it calculates the occurrence  | 
    51 //    (list of strings). For this it calculates the occurrence  | 
    28 //    maps from (2) and then multiplies the corresponding occurrences.   | 
    52 //    maps from (2) and then multiplies the corresponding occurrences.   | 
    29 //    If a string does not occur in a document, the product is zero.  | 
    53 //    If a string does not occur in a document, the product is zero.  | 
    30 //    The function finally sums up all products.   | 
    54 //    The function finally sums up all products.   | 
    31   | 
    55   | 
         | 
    56   | 
    32 def prod(lst1: List[String], lst2: List[String]) : Int = { | 
    57 def prod(lst1: List[String], lst2: List[String]) : Int = { | 
    33     val words = (lst1 ::: lst2).distinct  | 
    58     val map1 = occurrences(lst1)  | 
    34     val occs1 = occurrences(lst1)  | 
    59     val map2 = occurrences(lst2)  | 
    35     val occs2 = occurrences(lst2)  | 
    60     print(s"map1 is $map1 \n and map2 is $map2")  | 
    36     words.map{ w => occs1.getOrElse(w, 0) * occs2.getOrElse(w, 0) }.sum | 
    61     val pairs = (for(pair1 <- map1 if(map2.get(pair1._1) != None)) yield (pair1._2, map2.get(pair1._1).get)).toList  | 
         | 
    62     print(s"\n pairs are $pairs")  | 
         | 
    63     val products = (for(pair <- pairs) yield pair._1 * pair._2).toList  | 
         | 
    64     products.sum  | 
         | 
    65   | 
    37 }  | 
    66 }  | 
         | 
    67   | 
    38   | 
    68   | 
    39 //(4) Complete the functions overlap and similarity. The overlap of  | 
    69 //(4) Complete the functions overlap and similarity. The overlap of  | 
    40 //    two documents is calculated by the formula given in the assignment  | 
    70 //    two documents is calculated by the formula given in the assignment  | 
    41 //    description. The similarity of two strings is given by the overlap  | 
    71 //    description. The similarity of two strings is given by the overlap  | 
    42 //    of the cleaned (see (1)) strings.    | 
    72 //    of the cleaned strings (see (1)).    | 
    43   | 
         | 
    44 def overlap(lst1: List[String], lst2: List[String]) : Double = { | 
         | 
    45     val m1 = prod(lst1, lst1)  | 
         | 
    46     val m2 = prod(lst2, lst2)   | 
         | 
    47     prod(lst1, lst2).toDouble / (List(m1, m2).max)  | 
         | 
    48 }  | 
         | 
    49   | 
         | 
    50 def similarity(s1: String, s2: String) : Double =  | 
         | 
    51   overlap(clean(s1), clean(s2))  | 
         | 
    52   | 
    73   | 
    53   | 
    74   | 
    54 /*  | 
    75 //def overlap(lst1: List[String], lst2: List[String]) : Double = ...  | 
         | 
    76   | 
         | 
    77 //def similarity(s1: String, s2: String) : Double = ...  | 
         | 
    78   | 
         | 
    79   | 
         | 
    80   | 
         | 
    81   | 
         | 
    82 /* Test cases  | 
    55   | 
    83   | 
    56   | 
    84   | 
    57 val list1 = List("a", "b", "b", "c", "d")  | 
    85 val list1 = List("a", "b", "b", "c", "d")  | 
    58 val list2 = List("d", "b", "d", "b", "d") | 
    86 val list2 = List("d", "b", "d", "b", "d") | 
    59   | 
    87   | 
    60 occurrences(List("a", "b", "b", "c", "d"))   // Map(a -> 1, b -> 2, c -> 1, d -> 1) | 
    88 occurrences(List("a", "b", "b", "c", "d"))   // Map(a -> 1, b -> 2, c -> 1, d -> 1) | 
    61 occurrences(List("d", "b", "d", "b", "d"))   // Map(d -> 3, b -> 2) | 
    89 occurrences(List("d", "b", "d", "b", "d"))   // Map(d -> 3, b -> 2) | 
    62   | 
    90   | 
    63 prod(list1,list2) // 7   | 
    91 prod(list1,list2) // 7   | 
         | 
    92 prod(list1,list1)  | 
         | 
    93 prod(list2,list2)  | 
    64   | 
    94   | 
    65 overlap(list1, list2)   // 0.5384615384615384  | 
    95 overlap(list1, list2)   // 0.5384615384615384  | 
    66 overlap(list2, list1)   // 0.5384615384615384  | 
    96 overlap(list2, list1)   // 0.5384615384615384  | 
    67 overlap(list1, list1)   // 1.0  | 
    97 overlap(list1, list1)   // 1.0  | 
    68 overlap(list2, list2)   // 1.0  | 
    98 overlap(list2, list2)   // 1.0  |