Considering standard deviation for weight normalization in tfidf_plus.

Tássia Camões Araújo
1 parent 23c96b0a
Showing 2 changed files with 12 additions and 5 deletions.
src/data.py
src/strategy.py
@@ -68,7 +68,7 @@ def print_index(index):
         output += "\n---"
     return output
-def tfidf_weighting(index,docs,content_filter,plus=0):
+def tfidf_weighting(index,docs,content_filter,normalized_weigths=0):
     """
     Return a dictionary of terms and weights of all terms of a set of
     documents, based on the frequency of terms in the selected set (docids).
@@ -78,8 +78,8 @@ def tfidf_weighting(index,docs,content_filter,plus=0):
     for d in docs:
         for term in index.get_document(d.docid).termlist():
             if content_filter(term.term):
-                if plus:
-                    terms_doc.add_term(term.term,int(d.weight))
+                if normalized_weigths:
+                    terms_doc.add_term(term.term,int(math.ceil(normalized_weigths[d.docid])))
                 else:
                     terms_doc.add_term(term.term)
     # Compute sublinear tfidf for each term
@@ -104,7 +104,14 @@ def tfidf_plus(index,docs,content_filter):
     Return a dictionary of terms and weights of all terms of a set of
     documents, based on the frequency of terms in the selected set (docids).
     """
-    return tfidf_weighting(index,docs,content_filter,1)
+    normalized_weigths = {}
+    population = [d.weight for d in docs]
+    mean = sum(population)/len(population)
+    variance = sum([(p-mean)*(p-mean) for p in population])/len(population)
+    standard_deviation = math.sqrt(variance)
+    for d in docs:
+        normalized_weigths[d.docid] = d.weight/standard_deviation
+    return tfidf_weighting(index,docs,content_filter,normalized_weigths)
 class AppAptXapianIndex(xapian.WritableDatabase):
     """
@@ -212,7 +212,7 @@ class KnnPlus(Collaborative):
     KNN based packages tf-idf weights.
     """
     def __init__(self,k):
-        self.description = "Knn"
+        self.description = "Knn plus"
         self.neighbours = k
     def run(self,rec,user,recommendation_size):
@@ -212,7 +212,7 @@ class KnnPlus(Collaborative):  (side-by-side view: old | new)
212      KNN based packages tf-idf weights.          | 212      KNN based packages tf-idf weights.
213      """                                         | 213      """
214      def __init__(self,k):                       | 214      def __init__(self,k):
215 -        self.description = "Knn"                | 215 +        self.description = "Knn plus"
216          self.neighbours = k                     | 216          self.neighbours = k
217                                                  | 217
218      def run(self,rec,user,recommendation_size): | 218      def run(self,rec,user,recommendation_size):