pep-material: testing2/docdiff.scala@90aed247c8cf (annotated)

283 3102d61edf45 updated Christian Urban <urbanc@in.tum.de> parents: 211 diff changeset	1	// Preliminary Part about Code Similarity
3102d61edf45 updated Christian Urban <urbanc@in.tum.de> parents: 211 diff changeset	2	//========================================
211 1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	3
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	4
320 90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	5	object CW7a {
90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	6
211 1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	7
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	8	//(1) Complete the clean function below. It should find
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	9	// all words in a string using the regular expression
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	10	// \w+ and the library function
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	11	//
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	12	// some_regex.findAllIn(some_string)
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	13	//
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	14	// The words should be returned as a list of strings.
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	15
320 90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	16
/** Extracts every word from a string.
  *
  * A word is a maximal run of characters matching the regular expression
  * `\w+`. Everything else (whitespace, line breaks, punctuation, quotes,
  * hyphens, ...) acts as a separator and is dropped.
  *
  * The previous implementation split on single spaces and then tried to
  * re-split each token by comparing it with its filtered version; that
  * crashed on tokens with trailing punctuation (e.g. "Australia."), on
  * empty tokens produced by consecutive spaces, and on the empty string.
  * Asking the regex for all matches avoids all of these cases.
  *
  * @param s the text to tokenize; may be empty
  * @return the words of `s` in order of appearance (empty list if none)
  */
def clean(s: String) : List[String] = {
  val regex = """\w+""".r
  regex.findAllIn(s).toList
}
211 1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	23
/** Splits `returned` at the positions where `original` contained extra
  * characters.
  *
  * `returned` is expected to be `original` with some characters removed
  * (for instance the characters rejected by a regular expression). The two
  * strings are walked in parallel; each time `original` holds a character
  * that is missing from `returned`, the segment of `returned` collected so
  * far is closed and a new one is started.
  *
  * Unlike the earlier version, this one never indexes past the end of
  * either string (so trailing removed characters and empty inputs are
  * fine) and never returns empty segments.
  *
  * @param original the word before filtering
  * @param returned the word after the unwanted characters were removed
  * @param i        non-negative index at which the comparison starts; the
  *                 first `i` characters of both strings are taken to be equal
  * @return the non-empty pieces of `returned`, in order
  */
def divide_string_where_different(original: String, returned: String, i : Int): List[String] = {
  // origPos / retPos: cursors into `original` / `returned`.
  // segStart: index in `returned` where the current segment begins.
  // acc: finished segments, most recent first.
  @scala.annotation.tailrec
  def scan(origPos: Int, retPos: Int, segStart: Int, acc: List[String]): List[String] =
    if (origPos >= original.length || retPos >= returned.length) {
      // One of the strings is exhausted: whatever is left of `returned`
      // forms the last segment.
      (returned.substring(segStart) :: acc).reverse
    }
    else if (original(origPos) == returned(retPos)) {
      scan(origPos + 1, retPos + 1, segStart, acc)
    }
    else {
      // `original` has a character that was removed: close the current
      // segment and skip the removed character.
      scan(origPos + 1, retPos, retPos, returned.substring(segStart, retPos) :: acc)
    }

  scan(i, i, 0, Nil).filter(_.nonEmpty)
}
211 1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	36
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	37	//(2) The function occurrences calculates the number of times
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	38	// strings occur in a list of strings. These occurrences should
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	39	// be calculated as a Map from strings to integers.
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	40
320 90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	41
/** Counts how often each string occurs in a list.
  *
  * The list is traversed once, incrementing a per-word counter, instead of
  * re-counting the whole list for every distinct word.
  *
  * @param xs the strings to count; may be empty
  * @return a map from each distinct string in `xs` to its number of
  *         occurrences (empty map for an empty list)
  */
def occurrences(xs: List[String]): Map[String, Int] =
  xs.foldLeft(Map.empty[String, Int]) { (counts, word) =>
    counts.updated(word, counts.getOrElse(word, 0) + 1)
  }
90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	47
90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	48
211 1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	49
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	50	//(3) This function calculates the dot-product of two documents
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	51	// (list of strings). For this it calculates the occurrence
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	52	// maps from (2) and then multiplies the corresponding occurrences.
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	53	// If a string does not occur in a document, the product is zero.
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	54	// The function finally sums up all products.
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	55
320 90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	56
/** Dot-product of two documents given as lists of strings.
  *
  * Both documents are turned into occurrence maps via `occurrences`; for
  * every word of the first document its count is multiplied by the count
  * of the same word in the second document (0 if the word is absent
  * there), and all products are summed.
  *
  * The function is pure: the debug printing that used to write both maps
  * and the intermediate pairs to stdout has been removed.
  *
  * @param lst1 words of the first document
  * @param lst2 words of the second document
  * @return the sum over all words of count-in-lst1 * count-in-lst2
  */
def prod(lst1: List[String], lst2: List[String]) : Int = {
  val counts1 = occurrences(lst1)
  val counts2 = occurrences(lst2)
  // A word missing from the second document contributes 0 to the sum.
  counts1.iterator.map { case (word, n) => n * counts2.getOrElse(word, 0) }.sum
}
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	67
320 90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	68
211 1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	69	//(4) Complete the functions overlap and similarity. The overlap of
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	70	// two documents is calculated by the formula given in the assignment
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	71	// description. The similarity of two strings is given by the overlap
320 90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	72	// of the cleaned strings (see (1)).
211 1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	73
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	74
320 90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	75	//def overlap(lst1: List[String], lst2: List[String]) : Double = ...
90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	76
90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	77	//def similarity(s1: String, s2: String) : Double = ...
90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	78
90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	79
90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	80
90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	81
90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	82	/* Test cases
211 1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	83
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	84
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	85	val list1 = List("a", "b", "b", "c", "d")
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	86	val list2 = List("d", "b", "d", "b", "d")
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	87
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	88	occurrences(List("a", "b", "b", "c", "d")) // Map(a -> 1, b -> 2, c -> 1, d -> 1)
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	89	occurrences(List("d", "b", "d", "b", "d")) // Map(d -> 3, b -> 2)
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	90
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	91	prod(list1,list2) // 7
320 90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	92	prod(list1,list1)
90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	93	prod(list2,list2)
211 1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	94
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	95	overlap(list1, list2) // 0.5384615384615384
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	96	overlap(list2, list1) // 0.5384615384615384
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	97	overlap(list1, list1) // 1.0
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	98	overlap(list2, list2) // 1.0
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	99
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	100	// Plagiarism examples from
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	101	// https://desales.libguides.com/avoidingplagiarism/examples
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	102
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	103	val orig1 = """There is a strong market demand for eco-tourism in
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	104	Australia. Its rich and diverse natural heritage ensures Australia's
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	105	capacity to attract international ecotourists and gives Australia a
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	106	comparative advantage in the highly competitive tourism industry."""
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	107
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	108	val plag1 = """There is a high market demand for eco-tourism in
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	109	Australia. Australia has a comparative advantage in the highly
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	110	competitive tourism industry due to its rich and varied natural
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	111	heritage which ensures Australia's capacity to attract international
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	112	ecotourists."""
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	113
320 90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	114	similarity(orig1, plag1) // 0.8679245283018868
211 1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	115
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	116
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	117	// Plagiarism examples from
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	118	// https://www.utc.edu/library/help/tutorials/plagiarism/examples-of-plagiarism.php
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	119
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	120	val orig2 = """No oil spill is entirely benign. Depending on timing and
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	121	location, even a relatively minor spill can cause significant harm to
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	122	individual organisms and entire populations. Oil spills can cause
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	123	impacts over a range of time scales, from days to years, or even
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	124	decades for certain spills. Impacts are typically divided into acute
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	125	(short-term) and chronic (long-term) effects. Both types are part of a
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	126	complicated and often controversial equation that is addressed after
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	127	an oil spill: ecosystem recovery."""
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	128
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	129	val plag2 = """There is no such thing as a "good" oil spill. If the
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	130	time and place are just right, even a small oil spill can cause damage
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	131	to sensitive ecosystems. Further, spills can cause harm days, months,
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	132	years, or even decades after they occur. Because of this, spills are
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	133	usually broken into short-term (acute) and long-term (chronic)
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	134	effects. Both of these types of harm must be addressed in ecosystem
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	135	recovery: a controversial tactic that is often implemented immediately
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	136	following an oil spill."""
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	137
320 90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	138	overlap(clean(orig2), clean(plag2)) // 0.728
90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	139	similarity(orig2, plag2) // 0.728
90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	140
211 1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	141
320 90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	142
211 1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	143	// The punchline: everything above 0.6 looks suspicious and
320 90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	144	// should be investigated by staff.
211 1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	145
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	146	*/
1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	147
320 90aed247c8cf updated Christian Urban <urbanc@in.tum.de> parents: 283 diff changeset	148	}
211 1859d978b18e updated Christian Urban <urbanc@in.tum.de> parents: diff changeset	149

author	Christian Urban <urbanc@in.tum.de>
	Tue, 19 Nov 2019 00:40:27 +0000
changeset 320	90aed247c8cf
parent 283	3102d61edf45
child 323	93b6c16dded8
permissions	-rw-r--r--