Commit cc57f9339ba0e68efa15ace33cb962f82128b9ea

Authored by Tássia Camões Araújo
1 parent 23c96b0a
Exists in master and in 1 other branch: add_vagrant

Considering standard deviation for weight normalization in tfidf_plus.

Showing 2 changed files with 12 additions and 5 deletions.
@@ -68,7 +68,7 @@ def print_index(index): @@ -68,7 +68,7 @@ def print_index(index):
68 output += "\n---" 68 output += "\n---"
69 return output 69 return output
70 70
71 -def tfidf_weighting(index,docs,content_filter,plus=0): 71 +def tfidf_weighting(index,docs,content_filter,normalized_weigths=0):
72 """ 72 """
73 Return a dictionary of terms and weights of all terms of a set of 73 Return a dictionary of terms and weights of all terms of a set of
74 documents, based on the frequency of terms in the selected set (docids). 74 documents, based on the frequency of terms in the selected set (docids).
@@ -78,8 +78,8 @@ def tfidf_weighting(index,docs,content_filter,plus=0): @@ -78,8 +78,8 @@ def tfidf_weighting(index,docs,content_filter,plus=0):
78 for d in docs: 78 for d in docs:
79 for term in index.get_document(d.docid).termlist(): 79 for term in index.get_document(d.docid).termlist():
80 if content_filter(term.term): 80 if content_filter(term.term):
81 - if plus:  
82 - terms_doc.add_term(term.term,int(d.weight)) 81 + if normalized_weigths:
  82 + terms_doc.add_term(term.term,int(math.ceil(normalized_weigths[d.docid])))
83 else: 83 else:
84 terms_doc.add_term(term.term) 84 terms_doc.add_term(term.term)
85 # Compute sublinear tfidf for each term 85 # Compute sublinear tfidf for each term
@@ -104,7 +104,14 @@ def tfidf_plus(index,docs,content_filter): @@ -104,7 +104,14 @@ def tfidf_plus(index,docs,content_filter):
104 Return a dictionary of terms and weights of all terms of a set of 104 Return a dictionary of terms and weights of all terms of a set of
105 documents, based on the frequency of terms in the selected set (docids). 105 documents, based on the frequency of terms in the selected set (docids).
106 """ 106 """
107 - return tfidf_weighting(index,docs,content_filter,1) 107 + normalized_weigths = {}
  108 + population = [d.weight for d in docs]
  109 + mean = sum(population)/len(population)
  110 + variance = sum([(p-mean)*(p-mean) for p in population])/len(population)
  111 + standard_deviation = math.sqrt(variance)
  112 + for d in docs:
  113 + normalized_weigths[d.docid] = d.weight/standard_deviation
  114 + return tfidf_weighting(index,docs,content_filter,normalized_weigths)
108 115
109 class AppAptXapianIndex(xapian.WritableDatabase): 116 class AppAptXapianIndex(xapian.WritableDatabase):
110 """ 117 """
src/strategy.py
@@ -212,7 +212,7 @@ class KnnPlus(Collaborative): @@ -212,7 +212,7 @@ class KnnPlus(Collaborative):
212 KNN based packages tf-idf weights. 212 KNN based packages tf-idf weights.
213 """ 213 """
214 def __init__(self,k): 214 def __init__(self,k):
215 - self.description = "Knn" 215 + self.description = "Knn plus"
216 self.neighbours = k 216 self.neighbours = k
217 217
218 def run(self,rec,user,recommendation_size): 218 def run(self,rec,user,recommendation_size):