Commit cc57f9339ba0e68efa15ace33cb962f82128b9ea

Authored by Tássia Camões Araújo
1 parent 23c96b0a
Exists in master and in 1 other branch add_vagrant

Normalize document weights by their standard deviation in tfidf_plus.

Showing 2 changed files with 12 additions and 5 deletions   Show diff stats
src/data.py
... ... @@ -68,7 +68,7 @@ def print_index(index):
68 68 output += "\n---"
69 69 return output
70 70  
71   -def tfidf_weighting(index,docs,content_filter,plus=0):
  71 +def tfidf_weighting(index,docs,content_filter,normalized_weigths=0):
72 72 """
73 73 Return a dictionary of terms and weights of all terms of a set of
74 74 documents, based on the frequency of terms in the selected set (docids).
... ... @@ -78,8 +78,8 @@ def tfidf_weighting(index,docs,content_filter,plus=0):
78 78 for d in docs:
79 79 for term in index.get_document(d.docid).termlist():
80 80 if content_filter(term.term):
81   - if plus:
82   - terms_doc.add_term(term.term,int(d.weight))
  81 + if normalized_weigths:
  82 + terms_doc.add_term(term.term,int(math.ceil(normalized_weigths[d.docid])))
83 83 else:
84 84 terms_doc.add_term(term.term)
85 85 # Compute sublinear tfidf for each term
... ... @@ -104,7 +104,14 @@ def tfidf_plus(index,docs,content_filter):
104 104 Return a dictionary of terms and weights of all terms of a set of
105 105 documents, based on the frequency of terms in the selected set (docids).
106 106 """
107   - return tfidf_weighting(index,docs,content_filter,1)
  107 + normalized_weigths = {}
  108 + population = [d.weight for d in docs]
  109 + mean = sum(population)/len(population)
  110 + variance = sum([(p-mean)*(p-mean) for p in population])/len(population)
  111 + standard_deviation = math.sqrt(variance)
  112 + for d in docs:
  113 + normalized_weigths[d.docid] = d.weight/standard_deviation
  114 + return tfidf_weighting(index,docs,content_filter,normalized_weigths)
108 115  
109 116 class AppAptXapianIndex(xapian.WritableDatabase):
110 117 """
... ...
src/strategy.py
... ... @@ -212,7 +212,7 @@ class KnnPlus(Collaborative):
212 212 KNN based packages tf-idf weights.
213 213 """
214 214 def __init__(self,k):
215   - self.description = "Knn"
  215 + self.description = "Knn plus"
216 216 self.neighbours = k
217 217  
218 218 def run(self,rec,user,recommendation_size):
... ...