Commit cc57f9339ba0e68efa15ace33cb962f82128b9ea
1 parent
23c96b0a
Exists in master and in 1 other branch
Considering standard deviation for weight normalization in tfidf_plus.
Showing 2 changed files with 12 additions and 5 deletions
Show diff stats
src/data.py
| ... | ... | @@ -68,7 +68,7 @@ def print_index(index): |
| 68 | 68 | output += "\n---" |
| 69 | 69 | return output |
| 70 | 70 | |
| 71 | -def tfidf_weighting(index,docs,content_filter,plus=0): | |
| 71 | +def tfidf_weighting(index,docs,content_filter,normalized_weigths=0): | |
| 72 | 72 | """ |
| 73 | 73 | Return a dictionary of terms and weights of all terms of a set of |
| 74 | 74 | documents, based on the frequency of terms in the selected set (docids). |
| ... | ... | @@ -78,8 +78,8 @@ def tfidf_weighting(index,docs,content_filter,plus=0): |
| 78 | 78 | for d in docs: |
| 79 | 79 | for term in index.get_document(d.docid).termlist(): |
| 80 | 80 | if content_filter(term.term): |
| 81 | - if plus: | |
| 82 | - terms_doc.add_term(term.term,int(d.weight)) | |
| 81 | + if normalized_weigths: | |
| 82 | + terms_doc.add_term(term.term,int(math.ceil(normalized_weigths[d.docid]))) | |
| 83 | 83 | else: |
| 84 | 84 | terms_doc.add_term(term.term) |
| 85 | 85 | # Compute sublinear tfidf for each term |
| ... | ... | @@ -104,7 +104,14 @@ def tfidf_plus(index,docs,content_filter): |
| 104 | 104 | Return a dictionary of terms and weights of all terms of a set of |
| 105 | 105 | documents, based on the frequency of terms in the selected set (docids). |
| 106 | 106 | """ |
| 107 | - return tfidf_weighting(index,docs,content_filter,1) | |
| 107 | + normalized_weigths = {} | |
| 108 | + population = [d.weight for d in docs] | |
| 109 | + mean = sum(population)/len(population) | |
| 110 | + variance = sum([(p-mean)*(p-mean) for p in population])/len(population) | |
| 111 | + standard_deviation = math.sqrt(variance) | |
| 112 | + for d in docs: | |
| 113 | + normalized_weigths[d.docid] = d.weight/standard_deviation | |
| 114 | + return tfidf_weighting(index,docs,content_filter,normalized_weigths) | |
| 108 | 115 | |
| 109 | 116 | class AppAptXapianIndex(xapian.WritableDatabase): |
| 110 | 117 | """ | ... | ... |
src/strategy.py
| ... | ... | @@ -212,7 +212,7 @@ class KnnPlus(Collaborative): |
| 212 | 212 | KNN based packages tf-idf weights. |
| 213 | 213 | """ |
| 214 | 214 | def __init__(self,k): |
| 215 | - self.description = "Knn" | |
| 215 | + self.description = "Knn plus" | |
| 216 | 216 | self.neighbours = k |
| 217 | 217 | |
| 218 | 218 | def run(self,rec,user,recommendation_size): | ... | ... |