Commit cc57f9339ba0e68efa15ace33cb962f82128b9ea
1 parent
23c96b0a
Exists in
master
and in
1 other branch
Normalize document weights by their standard deviation in tfidf_plus.
Showing
2 changed files
with
12 additions
and
5 deletions
Show diff stats
src/data.py
... | ... | @@ -68,7 +68,7 @@ def print_index(index): |
68 | 68 | output += "\n---" |
69 | 69 | return output |
70 | 70 | |
71 | -def tfidf_weighting(index,docs,content_filter,plus=0): | |
71 | +def tfidf_weighting(index,docs,content_filter,normalized_weigths=0): | |
72 | 72 | """ |
73 | 73 | Return a dictionary of terms and weights of all terms of a set of |
74 | 74 | documents, based on the frequency of terms in the selected set (docids). |
... | ... | @@ -78,8 +78,8 @@ def tfidf_weighting(index,docs,content_filter,plus=0): |
78 | 78 | for d in docs: |
79 | 79 | for term in index.get_document(d.docid).termlist(): |
80 | 80 | if content_filter(term.term): |
81 | - if plus: | |
82 | - terms_doc.add_term(term.term,int(d.weight)) | |
81 | + if normalized_weigths: | |
82 | + terms_doc.add_term(term.term,int(math.ceil(normalized_weigths[d.docid]))) | |
83 | 83 | else: |
84 | 84 | terms_doc.add_term(term.term) |
85 | 85 | # Compute sublinear tfidf for each term |
... | ... | @@ -104,7 +104,14 @@ def tfidf_plus(index,docs,content_filter): |
104 | 104 | Return a dictionary of terms and weights of all terms of a set of |
105 | 105 | documents, based on the frequency of terms in the selected set (docids). |
106 | 106 | """ |
107 | - return tfidf_weighting(index,docs,content_filter,1) | |
107 | + normalized_weigths = {} | |
108 | + population = [d.weight for d in docs] | |
109 | + mean = sum(population)/len(population) | |
110 | + variance = sum([(p-mean)*(p-mean) for p in population])/len(population) | |
111 | + standard_deviation = math.sqrt(variance) | |
112 | + for d in docs: | |
113 | + normalized_weigths[d.docid] = d.weight/standard_deviation | |
114 | + return tfidf_weighting(index,docs,content_filter,normalized_weigths) | |
108 | 115 | |
109 | 116 | class AppAptXapianIndex(xapian.WritableDatabase): |
110 | 117 | """ | ... | ... |
src/strategy.py
... | ... | @@ -212,7 +212,7 @@ class KnnPlus(Collaborative): |
212 | 212 | KNN based packages tf-idf weights. |
213 | 213 | """ |
214 | 214 | def __init__(self,k): |
215 | - self.description = "Knn" | |
215 | + self.description = "Knn plus" | |
216 | 216 | self.neighbours = k |
217 | 217 | |
218 | 218 | def run(self,rec,user,recommendation_size): | ... | ... |