Merge branch 'master' of https://github.com/tassia/AppRecommender

Tássia Camões Araújo
2 parents dc8ededf cc57f933
Showing 4 changed files with 28 additions and 17 deletions Show diff stats
src/bin/cross_validation.py
src/data.py
src/evaluation.py
src/strategy.py
@@ -27,7 +27,7 @@ import logging
 import datetime
  
 from config import Config
-from evaluation import CrossValidation, Precision, Recall, F1, Accuracy, SimpleAccuracy
+from evaluation import CrossValidation, Precision, Recall, F_score, FPR, Accuracy
 from recommender import Recommender
 from user import RandomPopcon,LocalSystem,PopconSystem
  
@@ -45,10 +45,10 @@ if __name__ == '__main__':
     metrics = []
     metrics.append(Precision())
     metrics.append(Recall())
-    metrics.append(F1())
+    metrics.append(F_score(0.5))
     metrics.append(Accuracy())
-    metrics.append(SimpleAccuracy())
-    validation = CrossValidation(0.9,10,rec,metrics,0.005)
+    metrics.append(FPR())
+    validation = CrossValidation(0.9,10,rec,metrics,1)
     validation.run(user)
     print validation
  
@@ -80,7 +80,7 @@ def print_index(index):
         output += "\n---"
     return output
  
-def tfidf_weighting(index,docs,content_filter,plus=0):
+def tfidf_weighting(index,docs,content_filter,normalized_weigths=0):
     """
     Return a dictionary of terms and weights of all terms of a set of
     documents, based on the frequency of terms in the selected set (docids).
@@ -90,8 +90,8 @@ def tfidf_weighting(index,docs,content_filter,plus=0):
     for d in docs:
         for term in index.get_document(d.docid).termlist():
             if content_filter(term.term):
-                if plus:
-                    terms_doc.add_term(term.term,int(d.weight))
+                if normalized_weigths:
+                    terms_doc.add_term(term.term,int(math.ceil(normalized_weigths[d.docid])))
                 else:
                     terms_doc.add_term(term.term)
     # Compute sublinear tfidf for each term
@@ -116,7 +116,14 @@ def tfidf_plus(index,docs,content_filter):
     Return a dictionary of terms and weights of all terms of a set of
     documents, based on the frequency of terms in the selected set (docids).
     """
-    return tfidf_weighting(index,docs,content_filter,1)
+    normalized_weigths = {}
+    population = [d.weight for d in docs]
+    mean = sum(population)/len(population)
+    variance = sum([(p-mean)*(p-mean) for p in population])/len(population)
+    standard_deviation = math.sqrt(variance)
+    for d in docs:
+        normalized_weigths[d.docid] = d.weight/standard_deviation
+    return tfidf_weighting(index,docs,content_filter,normalized_weigths)
  
 class FilteredXapianIndex(xapian.WritableDatabase):
     """
@@ -137,7 +137,8 @@ class FPR(Metric):
         """
         Compute metric.
         """
-        return float(len(evaluation.false_positive))/evaluation.true_negatives_len
+        return (float(len(evaluation.false_positive))/
+                evaluation.real_negative_len)
  
 class F_score(Metric):
     """
@@ -148,7 +149,7 @@ class F_score(Metric):
         """
         Set metric description.
         """
-        self.desc = "  F_score   "
+        self.desc = "  F(%.1f)  " % k
         self.k = k
  
     def run(self,evaluation):
@@ -254,12 +255,15 @@ class Evaluation:
         self.false_negative = [v[0] for v in self.real_relevant if not v[0] in
                                [w[0] for w in self.predicted_relevant]]
  
-        self.true_negatives_len = self.repository_size - len(self.real_relevant)
-        #logging.debug("TP: %d" % len(self.true_positive))
-        #logging.debug("FP: %d" % len(self.false_positive))
-        #logging.debug("FN: %d" % len(self.false_negative))
-        #logging.debug("Repo_size: %d" % self.repository_size)
-        #logging.debug("Relevant: %d" % len(self.real_relevant))
+        self.real_negative_len = self.repository_size-len(self.real_relevant)
+        self.true_negative_len = (self.real_negative_len-len(self.false_positive))
+        logging.debug("TP: %d" % len(self.true_positive))
+        logging.debug("FP: %d" % len(self.false_positive))
+        logging.debug("FN: %d" % len(self.false_negative))
+        logging.debug("TN: %d" % self.true_negative_len)
+        logging.debug("Repo_size: %d" % self.repository_size)
+        logging.debug("Relevant: %d" % len(self.real_relevant))
+        logging.debug("Irrelevant: %d" % self.real_negative_len)
  
     def run(self,metric):
         """
@@ -212,7 +212,7 @@ class KnnPlus(Collaborative):
     KNN based packages tf-idf weights.
     """
     def __init__(self,k):
-        self.description = "Knn"
+        self.description = "Knn plus"
         self.neighbours = k
  
     def run(self,rec,user,recommendation_size):
...	...	@@ -27,7 +27,7 @@ import logging
27	27	import datetime
28	28
29	29	from config import Config
30		-from evaluation import CrossValidation, Precision, Recall, F1, Accuracy, SimpleAccuracy
	30	+from evaluation import CrossValidation, Precision, Recall, F_score, FPR, Accuracy
31	31	from recommender import Recommender
32	32	from user import RandomPopcon,LocalSystem,PopconSystem
33	33
...	...	@@ -45,10 +45,10 @@ if __name__ == '__main__':
45	45	metrics = []
46	46	metrics.append(Precision())
47	47	metrics.append(Recall())
48		- metrics.append(F1())
	48	+ metrics.append(F_score(0.5))
49	49	metrics.append(Accuracy())
50		- metrics.append(SimpleAccuracy())
51		- validation = CrossValidation(0.9,10,rec,metrics,0.005)
	50	+ metrics.append(FPR())
	51	+ validation = CrossValidation(0.9,10,rec,metrics,1)
52	52	validation.run(user)
53	53	print validation
54	54
...	...
...	...	@@ -80,7 +80,7 @@ def print_index(index):
80	80	output += "\n---"
81	81	return output
82	82
83		-def tfidf_weighting(index,docs,content_filter,plus=0):
	83	+def tfidf_weighting(index,docs,content_filter,normalized_weigths=0):
84	84	"""
85	85	Return a dictionary of terms and weights of all terms of a set of
86	86	documents, based on the frequency of terms in the selected set (docids).
...	...	@@ -90,8 +90,8 @@ def tfidf_weighting(index,docs,content_filter,plus=0):
90	90	for d in docs:
91	91	for term in index.get_document(d.docid).termlist():
92	92	if content_filter(term.term):
93		- if plus:
94		- terms_doc.add_term(term.term,int(d.weight))
	93	+ if normalized_weigths:
	94	+ terms_doc.add_term(term.term,int(math.ceil(normalized_weigths[d.docid])))
95	95	else:
96	96	terms_doc.add_term(term.term)
97	97	# Compute sublinear tfidf for each term
...	...	@@ -116,7 +116,14 @@ def tfidf_plus(index,docs,content_filter):
116	116	Return a dictionary of terms and weights of all terms of a set of
117	117	documents, based on the frequency of terms in the selected set (docids).
118	118	"""
119		- return tfidf_weighting(index,docs,content_filter,1)
	119	+ normalized_weigths = {}
	120	+ population = [d.weight for d in docs]
	121	+ mean = sum(population)/len(population)
	122	+ variance = sum([(p-mean)*(p-mean) for p in population])/len(population)
	123	+ standard_deviation = math.sqrt(variance)
	124	+ for d in docs:
	125	+ normalized_weigths[d.docid] = d.weight/standard_deviation
	126	+ return tfidf_weighting(index,docs,content_filter,normalized_weigths)
120	127
121	128	class FilteredXapianIndex(xapian.WritableDatabase):
122	129	"""
...	...
...	...	@@ -137,7 +137,8 @@ class FPR(Metric):
137	137	"""
138	138	Compute metric.
139	139	"""
140		- return float(len(evaluation.false_positive))/evaluation.true_negatives_len
	140	+ return (float(len(evaluation.false_positive))/
	141	+ evaluation.real_negative_len)
141	142
142	143	class F_score(Metric):
143	144	"""
...	...	@@ -148,7 +149,7 @@ class F_score(Metric):
148	149	"""
149	150	Set metric description.
150	151	"""
151		- self.desc = " F_score "
	152	+ self.desc = " F(%.1f) " % k
152	153	self.k = k
153	154
154	155	def run(self,evaluation):
...	...	@@ -254,12 +255,15 @@ class Evaluation:
254	255	self.false_negative = [v[0] for v in self.real_relevant if not v[0] in
255	256	[w[0] for w in self.predicted_relevant]]
256	257
257		- self.true_negatives_len = self.repository_size - len(self.real_relevant)
258		- #logging.debug("TP: %d" % len(self.true_positive))
259		- #logging.debug("FP: %d" % len(self.false_positive))
260		- #logging.debug("FN: %d" % len(self.false_negative))
261		- #logging.debug("Repo_size: %d" % self.repository_size)
262		- #logging.debug("Relevant: %d" % len(self.real_relevant))
	258	+ self.real_negative_len = self.repository_size-len(self.real_relevant)
	259	+ self.true_negative_len = (self.real_negative_len-len(self.false_positive))
	260	+ logging.debug("TP: %d" % len(self.true_positive))
	261	+ logging.debug("FP: %d" % len(self.false_positive))
	262	+ logging.debug("FN: %d" % len(self.false_negative))
	263	+ logging.debug("TN: %d" % self.true_negative_len)
	264	+ logging.debug("Repo_size: %d" % self.repository_size)
	265	+ logging.debug("Relevant: %d" % len(self.real_relevant))
	266	+ logging.debug("Irrelevant: %d" % self.real_negative_len)
263	267
264	268	def run(self,metric):
265	269	"""
...	...
...	...	@@ -212,7 +212,7 @@ class KnnPlus(Collaborative):
212	212	KNN based packages tf-idf weights.
213	213	"""
214	214	def __init__(self,k):
215		- self.description = "Knn"
	215	+ self.description = "Knn plus"
216	216	self.neighbours = k
217	217
218	218	def run(self,rec,user,recommendation_size):
...	...