Commit b1904ec8a298b5042034d30b953c4722d1c15127

Authored by Tássia Camões Araújo
2 parents dc8ededf cc57f933
Exists in master and in 1 other branch add_vagrant

Merge branch 'master' of https://github.com/tassia/AppRecommender

src/bin/cross_validation.py
... ... @@ -27,7 +27,7 @@ import logging
27 27 import datetime
28 28  
29 29 from config import Config
30   -from evaluation import CrossValidation, Precision, Recall, F1, Accuracy, SimpleAccuracy
  30 +from evaluation import CrossValidation, Precision, Recall, F_score, FPR, Accuracy
31 31 from recommender import Recommender
32 32 from user import RandomPopcon,LocalSystem,PopconSystem
33 33  
... ... @@ -45,10 +45,10 @@ if __name__ == '__main__':
45 45 metrics = []
46 46 metrics.append(Precision())
47 47 metrics.append(Recall())
48   - metrics.append(F1())
  48 + metrics.append(F_score(0.5))
49 49 metrics.append(Accuracy())
50   - metrics.append(SimpleAccuracy())
51   - validation = CrossValidation(0.9,10,rec,metrics,0.005)
  50 + metrics.append(FPR())
  51 + validation = CrossValidation(0.9,10,rec,metrics,1)
52 52 validation.run(user)
53 53 print validation
54 54  
... ...
src/data.py
... ... @@ -80,7 +80,7 @@ def print_index(index):
80 80 output += "\n---"
81 81 return output
82 82  
83   -def tfidf_weighting(index,docs,content_filter,plus=0):
  83 +def tfidf_weighting(index,docs,content_filter,normalized_weigths=0):
84 84 """
85 85 Return a dictionary of terms and weights of all terms of a set of
86 86 documents, based on the frequency of terms in the selected set (docids).
... ... @@ -90,8 +90,8 @@ def tfidf_weighting(index,docs,content_filter,plus=0):
90 90 for d in docs:
91 91 for term in index.get_document(d.docid).termlist():
92 92 if content_filter(term.term):
93   - if plus:
94   - terms_doc.add_term(term.term,int(d.weight))
  93 + if normalized_weigths:
  94 + terms_doc.add_term(term.term,int(math.ceil(normalized_weigths[d.docid])))
95 95 else:
96 96 terms_doc.add_term(term.term)
97 97 # Compute sublinear tfidf for each term
... ... @@ -116,7 +116,14 @@ def tfidf_plus(index,docs,content_filter):
116 116 Return a dictionary of terms and weights of all terms of a set of
117 117 documents, based on the frequency of terms in the selected set (docids).
118 118 """
119   - return tfidf_weighting(index,docs,content_filter,1)
  119 + normalized_weigths = {}
  120 + population = [d.weight for d in docs]
  121 + mean = sum(population)/len(population)
  122 + variance = sum([(p-mean)*(p-mean) for p in population])/len(population)
  123 + standard_deviation = math.sqrt(variance)
  124 + for d in docs:
  125 + normalized_weigths[d.docid] = d.weight/standard_deviation
  126 + return tfidf_weighting(index,docs,content_filter,normalized_weigths)
120 127  
121 128 class FilteredXapianIndex(xapian.WritableDatabase):
122 129 """
... ...
src/evaluation.py
... ... @@ -137,7 +137,8 @@ class FPR(Metric):
137 137 """
138 138 Compute metric.
139 139 """
140   - return float(len(evaluation.false_positive))/evaluation.true_negatives_len
  140 + return (float(len(evaluation.false_positive))/
  141 + evaluation.real_negative_len)
141 142  
142 143 class F_score(Metric):
143 144 """
... ... @@ -148,7 +149,7 @@ class F_score(Metric):
148 149 """
149 150 Set metric description.
150 151 """
151   - self.desc = " F_score "
  152 + self.desc = " F(%.1f) " % k
152 153 self.k = k
153 154  
154 155 def run(self,evaluation):
... ... @@ -254,12 +255,15 @@ class Evaluation:
254 255 self.false_negative = [v[0] for v in self.real_relevant if not v[0] in
255 256 [w[0] for w in self.predicted_relevant]]
256 257  
257   - self.true_negatives_len = self.repository_size - len(self.real_relevant)
258   - #logging.debug("TP: %d" % len(self.true_positive))
259   - #logging.debug("FP: %d" % len(self.false_positive))
260   - #logging.debug("FN: %d" % len(self.false_negative))
261   - #logging.debug("Repo_size: %d" % self.repository_size)
262   - #logging.debug("Relevant: %d" % len(self.real_relevant))
  258 + self.real_negative_len = self.repository_size-len(self.real_relevant)
  259 + self.true_negative_len = (self.real_negative_len-len(self.false_positive))
  260 + logging.debug("TP: %d" % len(self.true_positive))
  261 + logging.debug("FP: %d" % len(self.false_positive))
  262 + logging.debug("FN: %d" % len(self.false_negative))
  263 + logging.debug("TN: %d" % self.true_negative_len)
  264 + logging.debug("Repo_size: %d" % self.repository_size)
  265 + logging.debug("Relevant: %d" % len(self.real_relevant))
  266 + logging.debug("Irrelevant: %d" % self.real_negative_len)
263 267  
264 268 def run(self,metric):
265 269 """
... ...
src/strategy.py
... ... @@ -212,7 +212,7 @@ class KnnPlus(Collaborative):
212 212 KNN based packages tf-idf weights.
213 213 """
214 214 def __init__(self,k):
215   - self.description = "Knn"
  215 + self.description = "Knn plus"
216 216 self.neighbours = k
217 217  
218 218 def run(self,rec,user,recommendation_size):
... ...