Commit b1904ec8a298b5042034d30b953c4722d1c15127 (exists in master and in 1 other branch)
Merge branch 'master' of https://github.com/tassia/AppRecommender
Showing 4 changed files with 28 additions and 17 deletions (show diff stats)
src/bin/cross_validation.py
... | ... | @@ -27,7 +27,7 @@ import logging |
27 | 27 | import datetime |
28 | 28 | |
29 | 29 | from config import Config |
30 | -from evaluation import CrossValidation, Precision, Recall, F1, Accuracy, SimpleAccuracy | |
30 | +from evaluation import CrossValidation, Precision, Recall, F_score, FPR, Accuracy | |
31 | 31 | from recommender import Recommender |
32 | 32 | from user import RandomPopcon,LocalSystem,PopconSystem |
33 | 33 | |
... | ... | @@ -45,10 +45,10 @@ if __name__ == '__main__': |
45 | 45 | metrics = [] |
46 | 46 | metrics.append(Precision()) |
47 | 47 | metrics.append(Recall()) |
48 | - metrics.append(F1()) | |
48 | + metrics.append(F_score(0.5)) | |
49 | 49 | metrics.append(Accuracy()) |
50 | - metrics.append(SimpleAccuracy()) | |
51 | - validation = CrossValidation(0.9,10,rec,metrics,0.005) | |
50 | + metrics.append(FPR()) | |
51 | + validation = CrossValidation(0.9,10,rec,metrics,1) | |
52 | 52 | validation.run(user) |
53 | 53 | print validation |
54 | 54 | ... | ... |
src/data.py
... | ... | @@ -80,7 +80,7 @@ def print_index(index): |
80 | 80 | output += "\n---" |
81 | 81 | return output |
82 | 82 | |
83 | -def tfidf_weighting(index,docs,content_filter,plus=0): | |
83 | +def tfidf_weighting(index,docs,content_filter,normalized_weigths=0): | |
84 | 84 | """ |
85 | 85 | Return a dictionary of terms and weights of all terms of a set of |
86 | 86 | documents, based on the frequency of terms in the selected set (docids). |
... | ... | @@ -90,8 +90,8 @@ def tfidf_weighting(index,docs,content_filter,plus=0): |
90 | 90 | for d in docs: |
91 | 91 | for term in index.get_document(d.docid).termlist(): |
92 | 92 | if content_filter(term.term): |
93 | - if plus: | |
94 | - terms_doc.add_term(term.term,int(d.weight)) | |
93 | + if normalized_weigths: | |
94 | + terms_doc.add_term(term.term,int(math.ceil(normalized_weigths[d.docid]))) | |
95 | 95 | else: |
96 | 96 | terms_doc.add_term(term.term) |
97 | 97 | # Compute sublinear tfidf for each term |
... | ... | @@ -116,7 +116,14 @@ def tfidf_plus(index,docs,content_filter): |
116 | 116 | Return a dictionary of terms and weights of all terms of a set of |
117 | 117 | documents, based on the frequency of terms in the selected set (docids). |
118 | 118 | """ |
119 | - return tfidf_weighting(index,docs,content_filter,1) | |
119 | + normalized_weigths = {} | |
120 | + population = [d.weight for d in docs] | |
121 | + mean = sum(population)/len(population) | |
122 | + variance = sum([(p-mean)*(p-mean) for p in population])/len(population) | |
123 | + standard_deviation = math.sqrt(variance) | |
124 | + for d in docs: | |
125 | + normalized_weigths[d.docid] = d.weight/standard_deviation | |
126 | + return tfidf_weighting(index,docs,content_filter,normalized_weigths) | |
120 | 127 | |
121 | 128 | class FilteredXapianIndex(xapian.WritableDatabase): |
122 | 129 | """ | ... | ... |
src/evaluation.py
... | ... | @@ -137,7 +137,8 @@ class FPR(Metric): |
137 | 137 | """ |
138 | 138 | Compute metric. |
139 | 139 | """ |
140 | - return float(len(evaluation.false_positive))/evaluation.true_negatives_len | |
140 | + return (float(len(evaluation.false_positive))/ | |
141 | + evaluation.real_negative_len) | |
141 | 142 | |
142 | 143 | class F_score(Metric): |
143 | 144 | """ |
... | ... | @@ -148,7 +149,7 @@ class F_score(Metric): |
148 | 149 | """ |
149 | 150 | Set metric description. |
150 | 151 | """ |
151 | - self.desc = " F_score " | |
152 | + self.desc = " F(%.1f) " % k | |
152 | 153 | self.k = k |
153 | 154 | |
154 | 155 | def run(self,evaluation): |
... | ... | @@ -254,12 +255,15 @@ class Evaluation: |
254 | 255 | self.false_negative = [v[0] for v in self.real_relevant if not v[0] in |
255 | 256 | [w[0] for w in self.predicted_relevant]] |
256 | 257 | |
257 | - self.true_negatives_len = self.repository_size - len(self.real_relevant) | |
258 | - #logging.debug("TP: %d" % len(self.true_positive)) | |
259 | - #logging.debug("FP: %d" % len(self.false_positive)) | |
260 | - #logging.debug("FN: %d" % len(self.false_negative)) | |
261 | - #logging.debug("Repo_size: %d" % self.repository_size) | |
262 | - #logging.debug("Relevant: %d" % len(self.real_relevant)) | |
258 | + self.real_negative_len = self.repository_size-len(self.real_relevant) | |
259 | + self.true_negative_len = (self.real_negative_len-len(self.false_positive)) | |
260 | + logging.debug("TP: %d" % len(self.true_positive)) | |
261 | + logging.debug("FP: %d" % len(self.false_positive)) | |
262 | + logging.debug("FN: %d" % len(self.false_negative)) | |
263 | + logging.debug("TN: %d" % self.true_negative_len) | |
264 | + logging.debug("Repo_size: %d" % self.repository_size) | |
265 | + logging.debug("Relevant: %d" % len(self.real_relevant)) | |
266 | + logging.debug("Irrelevant: %d" % self.real_negative_len) | |
263 | 267 | |
264 | 268 | def run(self,metric): |
265 | 269 | """ | ... | ... |
src/strategy.py
... | ... | @@ -212,7 +212,7 @@ class KnnPlus(Collaborative): |
212 | 212 | KNN based packages tf-idf weights. |
213 | 213 | """ |
214 | 214 | def __init__(self,k): |
215 | - self.description = "Knn" | |
215 | + self.description = "Knn plus" | |
216 | 216 | self.neighbours = k |
217 | 217 | |
218 | 218 | def run(self,rec,user,recommendation_size): | ... | ... |