Commit cc57f9339ba0e68efa15ace33cb962f82128b9ea
1 parent: 23c96b0a
Exists in master and in 1 other branch
Considering standard deviation for weight normalization in tfidf_plus.
Showing 2 changed files with 12 additions and 5 deletions
Show diff stats
src/data.py
| @@ -68,7 +68,7 @@ def print_index(index): | @@ -68,7 +68,7 @@ def print_index(index): | ||
| 68 | output += "\n---" | 68 | output += "\n---" |
| 69 | return output | 69 | return output |
| 70 | 70 | ||
| 71 | -def tfidf_weighting(index,docs,content_filter,plus=0): | 71 | +def tfidf_weighting(index,docs,content_filter,normalized_weigths=0): |
| 72 | """ | 72 | """ |
| 73 | Return a dictionary of terms and weights of all terms of a set of | 73 | Return a dictionary of terms and weights of all terms of a set of |
| 74 | documents, based on the frequency of terms in the selected set (docids). | 74 | documents, based on the frequency of terms in the selected set (docids). |
| @@ -78,8 +78,8 @@ def tfidf_weighting(index,docs,content_filter,plus=0): | @@ -78,8 +78,8 @@ def tfidf_weighting(index,docs,content_filter,plus=0): | ||
| 78 | for d in docs: | 78 | for d in docs: |
| 79 | for term in index.get_document(d.docid).termlist(): | 79 | for term in index.get_document(d.docid).termlist(): |
| 80 | if content_filter(term.term): | 80 | if content_filter(term.term): |
| 81 | - if plus: | ||
| 82 | - terms_doc.add_term(term.term,int(d.weight)) | 81 | + if normalized_weigths: |
| 82 | + terms_doc.add_term(term.term,int(math.ceil(normalized_weigths[d.docid]))) | ||
| 83 | else: | 83 | else: |
| 84 | terms_doc.add_term(term.term) | 84 | terms_doc.add_term(term.term) |
| 85 | # Compute sublinear tfidf for each term | 85 | # Compute sublinear tfidf for each term |
| @@ -104,7 +104,14 @@ def tfidf_plus(index,docs,content_filter): | @@ -104,7 +104,14 @@ def tfidf_plus(index,docs,content_filter): | ||
| 104 | Return a dictionary of terms and weights of all terms of a set of | 104 | Return a dictionary of terms and weights of all terms of a set of |
| 105 | documents, based on the frequency of terms in the selected set (docids). | 105 | documents, based on the frequency of terms in the selected set (docids). |
| 106 | """ | 106 | """ |
| 107 | - return tfidf_weighting(index,docs,content_filter,1) | 107 | + normalized_weigths = {} |
| 108 | + population = [d.weight for d in docs] | ||
| 109 | + mean = sum(population)/len(population) | ||
| 110 | + variance = sum([(p-mean)*(p-mean) for p in population])/len(population) | ||
| 111 | + standard_deviation = math.sqrt(variance) | ||
| 112 | + for d in docs: | ||
| 113 | + normalized_weigths[d.docid] = d.weight/standard_deviation | ||
| 114 | + return tfidf_weighting(index,docs,content_filter,normalized_weigths) | ||
| 108 | 115 | ||
| 109 | class AppAptXapianIndex(xapian.WritableDatabase): | 116 | class AppAptXapianIndex(xapian.WritableDatabase): |
| 110 | """ | 117 | """ |
src/strategy.py
| @@ -212,7 +212,7 @@ class KnnPlus(Collaborative): | @@ -212,7 +212,7 @@ class KnnPlus(Collaborative): | ||
| 212 | KNN based packages tf-idf weights. | 212 | KNN based packages tf-idf weights. |
| 213 | """ | 213 | """ |
| 214 | def __init__(self,k): | 214 | def __init__(self,k): |
| 215 | - self.description = "Knn" | 215 | + self.description = "Knn plus" |
| 216 | self.neighbours = k | 216 | self.neighbours = k |
| 217 | 217 | ||
| 218 | def run(self,rec,user,recommendation_size): | 218 | def run(self,rec,user,recommendation_size): |