Commit b1904ec8a298b5042034d30b953c4722d1c15127
Exists in master and in 1 other branch
Merge branch 'master' of https://github.com/tassia/AppRecommender
Showing 4 changed files with 28 additions and 17 deletions
Show diff stats
src/bin/cross_validation.py
| ... | ... | @@ -27,7 +27,7 @@ import logging |
| 27 | 27 | import datetime |
| 28 | 28 | |
| 29 | 29 | from config import Config |
| 30 | -from evaluation import CrossValidation, Precision, Recall, F1, Accuracy, SimpleAccuracy | |
| 30 | +from evaluation import CrossValidation, Precision, Recall, F_score, FPR, Accuracy | |
| 31 | 31 | from recommender import Recommender |
| 32 | 32 | from user import RandomPopcon,LocalSystem,PopconSystem |
| 33 | 33 | |
| ... | ... | @@ -45,10 +45,10 @@ if __name__ == '__main__': |
| 45 | 45 | metrics = [] |
| 46 | 46 | metrics.append(Precision()) |
| 47 | 47 | metrics.append(Recall()) |
| 48 | - metrics.append(F1()) | |
| 48 | + metrics.append(F_score(0.5)) | |
| 49 | 49 | metrics.append(Accuracy()) |
| 50 | - metrics.append(SimpleAccuracy()) | |
| 51 | - validation = CrossValidation(0.9,10,rec,metrics,0.005) | |
| 50 | + metrics.append(FPR()) | |
| 51 | + validation = CrossValidation(0.9,10,rec,metrics,1) | |
| 52 | 52 | validation.run(user) |
| 53 | 53 | print validation |
| 54 | 54 | ... | ... |
src/data.py
| ... | ... | @@ -80,7 +80,7 @@ def print_index(index): |
| 80 | 80 | output += "\n---" |
| 81 | 81 | return output |
| 82 | 82 | |
| 83 | -def tfidf_weighting(index,docs,content_filter,plus=0): | |
| 83 | +def tfidf_weighting(index,docs,content_filter,normalized_weigths=0): | |
| 84 | 84 | """ |
| 85 | 85 | Return a dictionary of terms and weights of all terms of a set of |
| 86 | 86 | documents, based on the frequency of terms in the selected set (docids). |
| ... | ... | @@ -90,8 +90,8 @@ def tfidf_weighting(index,docs,content_filter,plus=0): |
| 90 | 90 | for d in docs: |
| 91 | 91 | for term in index.get_document(d.docid).termlist(): |
| 92 | 92 | if content_filter(term.term): |
| 93 | - if plus: | |
| 94 | - terms_doc.add_term(term.term,int(d.weight)) | |
| 93 | + if normalized_weigths: | |
| 94 | + terms_doc.add_term(term.term,int(math.ceil(normalized_weigths[d.docid]))) | |
| 95 | 95 | else: |
| 96 | 96 | terms_doc.add_term(term.term) |
| 97 | 97 | # Compute sublinear tfidf for each term |
| ... | ... | @@ -116,7 +116,14 @@ def tfidf_plus(index,docs,content_filter): |
| 116 | 116 | Return a dictionary of terms and weights of all terms of a set of |
| 117 | 117 | documents, based on the frequency of terms in the selected set (docids). |
| 118 | 118 | """ |
| 119 | - return tfidf_weighting(index,docs,content_filter,1) | |
| 119 | + normalized_weigths = {} | |
| 120 | + population = [d.weight for d in docs] | |
| 121 | + mean = sum(population)/len(population) | |
| 122 | + variance = sum([(p-mean)*(p-mean) for p in population])/len(population) | |
| 123 | + standard_deviation = math.sqrt(variance) | |
| 124 | + for d in docs: | |
| 125 | + normalized_weigths[d.docid] = d.weight/standard_deviation | |
| 126 | + return tfidf_weighting(index,docs,content_filter,normalized_weigths) | |
| 120 | 127 | |
| 121 | 128 | class FilteredXapianIndex(xapian.WritableDatabase): |
| 122 | 129 | """ | ... | ... |
src/evaluation.py
| ... | ... | @@ -137,7 +137,8 @@ class FPR(Metric): |
| 137 | 137 | """ |
| 138 | 138 | Compute metric. |
| 139 | 139 | """ |
| 140 | - return float(len(evaluation.false_positive))/evaluation.true_negatives_len | |
| 140 | + return (float(len(evaluation.false_positive))/ | |
| 141 | + evaluation.real_negative_len) | |
| 141 | 142 | |
| 142 | 143 | class F_score(Metric): |
| 143 | 144 | """ |
| ... | ... | @@ -148,7 +149,7 @@ class F_score(Metric): |
| 148 | 149 | """ |
| 149 | 150 | Set metric description. |
| 150 | 151 | """ |
| 151 | - self.desc = " F_score " | |
| 152 | + self.desc = " F(%.1f) " % k | |
| 152 | 153 | self.k = k |
| 153 | 154 | |
| 154 | 155 | def run(self,evaluation): |
| ... | ... | @@ -254,12 +255,15 @@ class Evaluation: |
| 254 | 255 | self.false_negative = [v[0] for v in self.real_relevant if not v[0] in |
| 255 | 256 | [w[0] for w in self.predicted_relevant]] |
| 256 | 257 | |
| 257 | - self.true_negatives_len = self.repository_size - len(self.real_relevant) | |
| 258 | - #logging.debug("TP: %d" % len(self.true_positive)) | |
| 259 | - #logging.debug("FP: %d" % len(self.false_positive)) | |
| 260 | - #logging.debug("FN: %d" % len(self.false_negative)) | |
| 261 | - #logging.debug("Repo_size: %d" % self.repository_size) | |
| 262 | - #logging.debug("Relevant: %d" % len(self.real_relevant)) | |
| 258 | + self.real_negative_len = self.repository_size-len(self.real_relevant) | |
| 259 | + self.true_negative_len = (self.real_negative_len-len(self.false_positive)) | |
| 260 | + logging.debug("TP: %d" % len(self.true_positive)) | |
| 261 | + logging.debug("FP: %d" % len(self.false_positive)) | |
| 262 | + logging.debug("FN: %d" % len(self.false_negative)) | |
| 263 | + logging.debug("TN: %d" % self.true_negative_len) | |
| 264 | + logging.debug("Repo_size: %d" % self.repository_size) | |
| 265 | + logging.debug("Relevant: %d" % len(self.real_relevant)) | |
| 266 | + logging.debug("Irrelevant: %d" % self.real_negative_len) | |
| 263 | 267 | |
| 264 | 268 | def run(self,metric): |
| 265 | 269 | """ | ... | ... |
src/strategy.py
| ... | ... | @@ -212,7 +212,7 @@ class KnnPlus(Collaborative): |
| 212 | 212 | KNN based packages tf-idf weights. |
| 213 | 213 | """ |
| 214 | 214 | def __init__(self,k): |
| 215 | - self.description = "Knn" | |
| 215 | + self.description = "Knn plus" | |
| 216 | 216 | self.neighbours = k |
| 217 | 217 | |
| 218 | 218 | def run(self,rec,user,recommendation_size): | ... | ... |