Commit 65be4b76c9e779b7f600b211e41649b0310b3eaa

Authored by Tássia Camões Araújo
2 parents e3176f2b 9e602af3
Exists in master and in 1 other branch add_vagrant

Merge remote branch 'upstream/master'

Conflicts:
	src/data.py
@@ -44,7 +44,8 @@ class Config(): @@ -44,7 +44,8 @@ class Config():
44 self.popcon_index = os.path.expanduser("~/.app-recommender/popcon_index") 44 self.popcon_index = os.path.expanduser("~/.app-recommender/popcon_index")
45 self.popcon_dir = os.path.expanduser("~/.app-recommender/popcon_dir") 45 self.popcon_dir = os.path.expanduser("~/.app-recommender/popcon_dir")
46 self.clusters_dir = os.path.expanduser("~/.app-recommender/clusters_dir") 46 self.clusters_dir = os.path.expanduser("~/.app-recommender/clusters_dir")
47 - self.index_mode = "0" # use old index 47 + self.k_medoids = 100
  48 + self.index_mode = "old"
48 self.strategy = "cb" 49 self.strategy = "cb"
49 self.weight = "bm25" 50 self.weight = "bm25"
50 self.load_options() 51 self.load_options()
@@ -65,8 +66,9 @@ class Config(): @@ -65,8 +66,9 @@ class Config():
65 print " -a, --axi=PATH Path to Apt-xapian-index" 66 print " -a, --axi=PATH Path to Apt-xapian-index"
66 print " -p, --popconindex=PATH Path to popcon dedicated index" 67 print " -p, --popconindex=PATH Path to popcon dedicated index"
67 print " -m, --popcondir=PATH Path to popcon submissions dir" 68 print " -m, --popcondir=PATH Path to popcon submissions dir"
68 - print " -u, --index_mode= 0: old, 1:reindex, 11:clustered_index" 69 + print " -u, --indexmode= old, reindex, cluster, recluster"
69 print " -l, --clustersdir=PATH Path to popcon clusters dir" 70 print " -l, --clustersdir=PATH Path to popcon clusters dir"
  71 + print " -e, --medoids=k Number of medoids for clustering"
70 print " -w, --weight=OPTION Search weighting scheme" 72 print " -w, --weight=OPTION Search weighting scheme"
71 print " -s, --strategy=OPTION Recommendation strategy" 73 print " -s, --strategy=OPTION Recommendation strategy"
72 print "" 74 print ""
@@ -115,13 +117,14 @@ class Config(): @@ -115,13 +117,14 @@ class Config():
115 self.popcon_dir = self.read_option('recommender', 'popcon_dir') 117 self.popcon_dir = self.read_option('recommender', 'popcon_dir')
116 self.index_mode = self.read_option('recommender', 'index_mode') 118 self.index_mode = self.read_option('recommender', 'index_mode')
117 self.clusters_dir = self.read_option('recommender', 'clusters_dir') 119 self.clusters_dir = self.read_option('recommender', 'clusters_dir')
  120 + self.k_medoids = self.read_option('recommender', 'k_medoids')
118 self.weight = self.read_option('recommender', 'weight') 121 self.weight = self.read_option('recommender', 'weight')
119 self.strategy = self.read_option('recommender', 'strategy') 122 self.strategy = self.read_option('recommender', 'strategy')
120 123
121 - short_options = "hdvo:c:a:p:m:ul:w:s:" 124 + short_options = "hdvo:c:a:p:m:ul:e:w:s:"
122 long_options = ["help", "debug", "verbose", "output=", "config=", 125 long_options = ["help", "debug", "verbose", "output=", "config=",
123 - "axi=", "popconindex=", "popcondir=", "index_mode=",  
124 - "clusters_dir=", "weight=", "strategy="] 126 + "axi=", "popconindex=", "popcondir=", "indexmode=",
  127 + "clustersdir=", "kmedoids=", "weight=", "strategy="]
125 try: 128 try:
126 opts, args = getopt.getopt(sys.argv[1:], short_options, 129 opts, args = getopt.getopt(sys.argv[1:], short_options,
127 long_options) 130 long_options)
@@ -154,6 +157,8 @@ class Config(): @@ -154,6 +157,8 @@ class Config():
154 self.index_mode = p 157 self.index_mode = p
155 elif o in ("-l", "--clustersdir"): 158 elif o in ("-l", "--clustersdir"):
156 self.clusters_dir = p 159 self.clusters_dir = p
  160 + elif o in ("-e", "--kmedoids"):
  161 + self.k_medoids = p
157 elif o in ("-w", "--weight"): 162 elif o in ("-w", "--weight"):
158 self.weight = p 163 self.weight = p
159 elif o in ("-s", "--strategy"): 164 elif o in ("-s", "--strategy"):
@@ -129,31 +129,36 @@ class PopconXapianIndex(xapian.WritableDatabase): @@ -129,31 +129,36 @@ class PopconXapianIndex(xapian.WritableDatabase):
129 """ 129 """
130 self.axi = xapian.Database(cfg.axi) 130 self.axi = xapian.Database(cfg.axi)
131 self.path = os.path.expanduser(cfg.popcon_index) 131 self.path = os.path.expanduser(cfg.popcon_index)
132 - if cfg.index_mode.startswith("1") or not self.load_index(): 132 + if not cfg.index_mode == "old" or not self.load_index():
133 if not os.path.exists(cfg.popcon_dir): 133 if not os.path.exists(cfg.popcon_dir):
134 os.makedirs(cfg.popcon_dir) 134 os.makedirs(cfg.popcon_dir)
135 if not os.listdir(cfg.popcon_dir): 135 if not os.listdir(cfg.popcon_dir):
136 logging.critical("Popcon dir seems to be empty.") 136 logging.critical("Popcon dir seems to be empty.")
137 raise Error 137 raise Error
138 - if cfg.index_mode == "10": 138 + if cfg.index_mode == "reindex":
139 self.source_dir = os.path.expanduser(cfg.popcon_dir) 139 self.source_dir = os.path.expanduser(cfg.popcon_dir)
140 else: 140 else:
141 self.source_dir = os.path.expanduser(cfg.clusters_dir) 141 self.source_dir = os.path.expanduser(cfg.clusters_dir)
142 if not os.path.exists(cfg.clusters_dir): 142 if not os.path.exists(cfg.clusters_dir):
143 os.makedirs(cfg.clusters_dir) 143 os.makedirs(cfg.clusters_dir)
144 - if not os.listdir(cfg.clusters_dir):  
145 - distance = JaccardDistance() 144 + if not os.listdir(cfg.clusters_dir) or \
  145 + cfg.index_mode == "recluster":
  146 + shutil.rmtree(cfg.clusters_dir,1)
  147 + os.makedirs(cfg.clusters_dir)
146 logging.info("Clustering popcon submissions from \'%s\'" 148 logging.info("Clustering popcon submissions from \'%s\'"
147 % cfg.popcon_dir) 149 % cfg.popcon_dir)
148 logging.info("Clusters will be placed at \'%s\'" 150 logging.info("Clusters will be placed at \'%s\'"
149 % cfg.clusters_dir) 151 % cfg.clusters_dir)
  152 + distance = JaccardDistance()
150 data = self.get_submissions(cfg.popcon_dir) 153 data = self.get_submissions(cfg.popcon_dir)
151 - if cfg.clustering == "Hierarchical":  
152 - self.hierarchical_clustering(data,cfg.clusters_dir,  
153 - distance)  
154 - else:  
155 - self.kmedoids_clustering(data,cfg.clusters_dir,  
156 - distance) 154 + self.cluster_dispersion = \
  155 + self.kmedoids_clustering(data, cfg.clusters_dir,
  156 + distance, cfg.k_medoids)
  157 + logging.info("Clusters dispersion: %f.2",
  158 + self.cluster_dispersion)
  159 + else:
  160 + logging.info("Using clusters from \'%s\'" %
  161 + cfg.clusters_dir)
157 self.build_index() 162 self.build_index()
158 163
159 def __str__(self): 164 def __str__(self):
@@ -167,10 +172,9 @@ class PopconXapianIndex(xapian.WritableDatabase): @@ -167,10 +172,9 @@ class PopconXapianIndex(xapian.WritableDatabase):
167 logging.info("Opening existing popcon xapian index at \'%s\'" 172 logging.info("Opening existing popcon xapian index at \'%s\'"
168 % self.path) 173 % self.path)
169 xapian.Database.__init__(self,self.path) 174 xapian.Database.__init__(self,self.path)
170 - return True 175 + return 1
171 except xapian.DatabaseError: 176 except xapian.DatabaseError:
172 logging.info("Could not open popcon index.") 177 logging.info("Could not open popcon index.")
173 - return True  
174 return 0 178 return 0
175 179
176 def build_index(self): 180 def build_index(self):
@@ -224,35 +228,23 @@ class PopconXapianIndex(xapian.WritableDatabase): @@ -224,35 +228,23 @@ class PopconXapianIndex(xapian.WritableDatabase):
224 submissions.append(submission) 228 submissions.append(submission)
225 return submissions 229 return submissions
226 230
227 - def hierarchical_clustering(self,data,clusters_dir,distance,k=10):  
228 - """  
229 - Select popcon submissions from popcon_dir and place them at clusters_dir  
230 - """  
231 - cl = cluster.HierarchicalClustering(data, lambda x,y:  
232 - distance(x.packages.keys(),  
233 - y.packages.keys()))  
234 - clusters = cl.getlevel(0.5)  
235 - for c in clusters:  
236 - print "cluster"  
237 - for submission in c:  
238 - print submission.user_id  
239 -  
240 - def kmedoids_clustering(self,data,clusters_dir,distance,k=10): 231 + def kmedoids_clustering(self,data,clusters_dir,distance,k_medoids):
241 clusters = KMedoidsClustering(data,lambda x,y: 232 clusters = KMedoidsClustering(data,lambda x,y:
242 distance(x.packages.keys(), 233 distance(x.packages.keys(),
243 y.packages.keys())) 234 y.packages.keys()))
244 - medoids = clusters.getMedoids(2) 235 + medoids,dispersion = clusters.getMedoids(k_medoids)
245 for submission in medoids: 236 for submission in medoids:
246 shutil.copyfile(submission.path,os.path.join(clusters_dir, 237 shutil.copyfile(submission.path,os.path.join(clusters_dir,
247 submission.user_id)) 238 submission.user_id))
  239 + return dispersion
248 240
249 class KMedoidsClustering(cluster.KMeansClustering): 241 class KMedoidsClustering(cluster.KMeansClustering):
250 242
251 - def __init__(self,data,distance):  
252 - if len(data)<100: 243 + def __init__(self,data,distance,max_data=100):
  244 + if len(data)<max_data:
253 data_sample = data 245 data_sample = data
254 else: 246 else:
255 - data_sample = random.sample(data,100) 247 + data_sample = random.sample(data,max_data)
256 cluster.KMeansClustering.__init__(self, data_sample, distance) 248 cluster.KMeansClustering.__init__(self, data_sample, distance)
257 self.distanceMatrix = {} 249 self.distanceMatrix = {}
258 for submission in self._KMeansClustering__data: 250 for submission in self._KMeansClustering__data:
@@ -287,7 +279,7 @@ class KMedoidsClustering(cluster.KMeansClustering): @@ -287,7 +279,7 @@ class KMedoidsClustering(cluster.KMeansClustering):
287 logging.debug("medoidDistance: %f" % medoidDistance) 279 logging.debug("medoidDistance: %f" % medoidDistance)
288 logging.debug("Cluster medoid: [%d] %s" % (medoid, 280 logging.debug("Cluster medoid: [%d] %s" % (medoid,
289 cluster[medoid].user_id)) 281 cluster[medoid].user_id))
290 - return cluster[medoid] 282 + return (cluster[medoid],medoidDistance)
291 283
292 def assign_item(self, item, origin): 284 def assign_item(self, item, origin):
293 """ 285 """
@@ -295,7 +287,8 @@ class KMedoidsClustering(cluster.KMeansClustering): @@ -295,7 +287,8 @@ class KMedoidsClustering(cluster.KMeansClustering):
295 """ 287 """
296 closest_cluster = origin 288 closest_cluster = origin
297 for cluster in self._KMeansClustering__clusters: 289 for cluster in self._KMeansClustering__clusters:
298 - if self.distance(item,self.getMedoid(cluster)) < self.distance(item,self.getMedoid(closest_cluster)): 290 + if self.distance(item,self.getMedoid(cluster)[0]) < \
  291 + self.distance(item,self.getMedoid(closest_cluster)[0]):
299 closest_cluster = cluster 292 closest_cluster = cluster
300 293
301 if closest_cluster != origin: 294 if closest_cluster != origin:
@@ -309,6 +302,8 @@ class KMedoidsClustering(cluster.KMeansClustering): @@ -309,6 +302,8 @@ class KMedoidsClustering(cluster.KMeansClustering):
309 """ 302 """
310 Generate n clusters and return their medoids. 303 Generate n clusters and return their medoids.
311 """ 304 """
312 - medoids = [self.getMedoid(cluster) for cluster in self.getclusters(n)]  
313 - logging.info("Clustering completed and the following centroids were found: %s" % [c.user_id for c in medoids])  
314 - return medoids 305 + medoids_distances = [self.getMedoid(cluster) for cluster in self.getclusters(n)]
  306 + medoids = [m[0] for m in medoids_distances]
  307 + dispersion = sum([m[1] for m in medoids_distances])
  308 + logging.info("Clustering completed and the following medoids were found: %s" % [c.user_id for c in medoids])
  309 + return medoids,dispersion
src/evaluation.py
@@ -49,6 +49,45 @@ class Metric(Singleton): @@ -49,6 +49,45 @@ class Metric(Singleton):
49 evaluation.real_item_scores[k])) 49 evaluation.real_item_scores[k]))
50 return errors 50 return errors
51 51
  52 +
  53 +class SimpleAccuracy(Metric):
  54 + """
  55 + Simple classification accuracy: the fraction of repository items that are neither false positives nor false negatives.
  56 + """
  57 + def __init__(self):
  58 + """
  59 + Set metric description.
  60 + """
  61 + self.desc = " S_Accuracy "
  62 +
  63 + def run(self,evaluation):
  64 + """
  65 + Compute metric.
  66 + """
  67 + return float((evaluation.repository_size-
  68 + len(evaluation.false_positive))-
  69 + len(evaluation.false_negative))/evaluation.repository_size
  70 +
  71 +class Accuracy(Metric):
  72 + """
  73 + Classification accuracy metric which considers class sizes (one minus the mean of the false-positive and false-negative error rates).
  74 + """
  75 + def __init__(self):
  76 + """
  77 + Set metric description.
  78 + """
  79 + self.desc = " Accuracy "
  80 +
  81 + def run(self,evaluation):
  82 + """
  83 + Compute metric.
  84 + """
  85 + error_1 = (float(len(evaluation.false_positive))/
  86 + (evaluation.repository_size-len(evaluation.real_relevant)))
  87 + error_2 = (float(len(evaluation.false_negative))/len(evaluation.real_relevant))
  88 + accuracy = 1-(float(error_1+error_2)/2)
  89 + return accuracy
  90 +
52 class Precision(Metric): 91 class Precision(Metric):
53 """ 92 """
54 Classification accuracy metric defined as the percentage of relevant items 93 Classification accuracy metric defined as the percentage of relevant items
@@ -64,7 +103,7 @@ class Precision(Metric): @@ -64,7 +103,7 @@ class Precision(Metric):
64 """ 103 """
65 Compute metric. 104 Compute metric.
66 """ 105 """
67 - return float(len(evaluation.predicted_real))/len(evaluation.predicted_relevant) 106 + return float(len(evaluation.true_positive))/len(evaluation.predicted_relevant)
68 107
69 class Recall(Metric): 108 class Recall(Metric):
70 """ 109 """
@@ -81,7 +120,7 @@ class Recall(Metric): @@ -81,7 +120,7 @@ class Recall(Metric):
81 """ 120 """
82 Compute metric. 121 Compute metric.
83 """ 122 """
84 - return float(len(evaluation.predicted_real))/len(evaluation.real_relevant) 123 + return float(len(evaluation.true_positive))/len(evaluation.real_relevant)
85 124
86 class F1(Metric): 125 class F1(Metric):
87 """ 126 """
@@ -100,7 +139,10 @@ class F1(Metric): @@ -100,7 +139,10 @@ class F1(Metric):
100 """ 139 """
101 p = Precision().run(evaluation) 140 p = Precision().run(evaluation)
102 r = Recall().run(evaluation) 141 r = Recall().run(evaluation)
103 - return float((2*p*r))/(p+r) 142 + if (p+r)>0:
  143 + return float((2*p*r))/(p+r)
  144 + else:
  145 + return 0
104 146
105 class MAE(Metric): 147 class MAE(Metric):
106 """ 148 """
@@ -158,43 +200,47 @@ class Coverage(Metric): @@ -158,43 +200,47 @@ class Coverage(Metric):
158 Evaluation metric defined as the percentage of items covered by the 200 Evaluation metric defined as the percentage of items covered by the
159 recommender (i.e. items that have been recommended at least once). 201 recommender (i.e. items that have been recommended at least once).
160 """ 202 """
161 - def __init__(self,repository_size): 203 + def __init__(self):
162 """ 204 """
163 Set initial parameters. 205 Set initial parameters.
164 """ 206 """
165 self.desc = " Coverage " 207 self.desc = " Coverage "
166 - self.repository_size = repository_size  
167 - self.covered = set()  
168 -  
169 - def save_covered(self,recommended_list):  
170 - """  
171 - Register that a list of itens has been recommended.  
172 - """  
173 - self.covered.update(set(recommended_list))  
174 208
175 - def run(self,evaluation): 209 + def run(self,evaluations_set):
176 """ 210 """
177 Compute metric. 211 Compute metric.
178 """ 212 """
179 - return float(self.covered.size)/self.repository_size 213 + covered = set()
  214 + for evaluation in evaluations_set:
  215 + covered.update(set(evaluation.predicted_relevant))
  216 + return float(len(covered))/evaluation.repository_size
180 217
181 class Evaluation: 218 class Evaluation:
182 """ 219 """
183 Class designed to perform prediction evaluation, given data and metric. 220 Class designed to perform prediction evaluation, given data and metric.
184 """ 221 """
185 - def __init__(self,predicted_result,real_result): 222 + def __init__(self,predicted,real,repository_size):
186 """ 223 """
187 Set initial parameters. 224 Set initial parameters.
188 """ 225 """
189 - self.predicted_item_scores = predicted_result.item_score  
190 - self.predicted_relevant = predicted_result.get_prediction()  
191 - self.real_item_scores = real_result.item_score  
192 - self.real_relevant = real_result.get_prediction()  
193 - self.predicted_real = [v for v in self.predicted_relevant if v in  
194 - self.real_relevant]  
195 - #print len(self.predicted_relevant)  
196 - #print len(self.real_relevant)  
197 - #print len(self.predicted_real) 226 + self.repository_size = repository_size
  227 + self.predicted_item_scores = predicted.item_score
  228 + self.predicted_relevant = predicted.get_prediction()
  229 + self.real_item_scores = real.item_score
  230 + self.real_relevant = real.get_prediction()
  231 +
  232 + self.true_positive = [v[0] for v in self.predicted_relevant if v[0] in
  233 + [w[0] for w in self.real_relevant]]
  234 + self.false_positive = [v[0] for v in self.predicted_relevant if not v[0] in
  235 + [w[0] for w in self.real_relevant]]
  236 + self.false_negative = [v[0] for v in self.real_relevant if not v[0] in
  237 + [w[0] for w in self.predicted_relevant]]
  238 +
  239 + logging.debug("TP: %d" % len(self.true_positive))
  240 + logging.debug("FP: %d" % len(self.false_positive))
  241 + logging.debug("FN: %d" % len(self.false_negative))
  242 + logging.debug("Repo_size: %d" % self.repository_size)
  243 + logging.debug("Relevant: %d" % len(self.real_relevant))
198 244
199 def run(self,metric): 245 def run(self,metric):
200 """ 246 """
@@ -206,7 +252,7 @@ class CrossValidation: @@ -206,7 +252,7 @@ class CrossValidation:
206 """ 252 """
207 Class designed to perform cross-validation process. 253 Class designed to perform cross-validation process.
208 """ 254 """
209 - def __init__(self,partition_proportion,rounds,rec,metrics_list): 255 + def __init__(self,partition_proportion,rounds,rec,metrics_list,result_proportion):
210 """ 256 """
211 Set initial parameters. 257 Set initial parameters.
212 """ 258 """
@@ -219,34 +265,13 @@ class CrossValidation: @@ -219,34 +265,13 @@ class CrossValidation:
219 self.recommender = rec 265 self.recommender = rec
220 self.metrics_list = metrics_list 266 self.metrics_list = metrics_list
221 self.cross_results = defaultdict(list) 267 self.cross_results = defaultdict(list)
222 -  
223 - def __str__(self):  
224 - """  
225 - String representation of the object.  
226 - """  
227 - str = "\n"  
228 - metrics_desc = ""  
229 - for metric in self.metrics_list:  
230 - metrics_desc += "%s|" % (metric.desc)  
231 - str += "| Round |%s\n" % metrics_desc  
232 - for r in range(self.rounds):  
233 - metrics_result = ""  
234 - for metric in self.metrics_list:  
235 - metrics_result += (" %2.1f%% |" %  
236 - (self.cross_results[metric.desc][r]*100))  
237 - str += "| %d |%s\n" % (r,metrics_result)  
238 - metrics_mean = ""  
239 - for metric in self.metrics_list:  
240 - mean = float(sum(self.cross_results[metric.desc]) /  
241 - len(self.cross_results[metric.desc]))  
242 - metrics_mean += " %2.1f%% |" % (mean*100)  
243 - str += "| Mean |%s\n" % (metrics_mean)  
244 - return str 268 + self.result_proportion = result_proportion
245 269
246 def run(self,user): 270 def run(self,user):
247 """ 271 """
248 Perform cross-validation. 272 Perform cross-validation.
249 """ 273 """
  274 + #
250 cross_item_score = dict.fromkeys(user.pkg_profile,1) 275 cross_item_score = dict.fromkeys(user.pkg_profile,1)
251 partition_size = int(len(cross_item_score)*self.partition_proportion) 276 partition_size = int(len(cross_item_score)*self.partition_proportion)
252 for r in range(self.rounds): 277 for r in range(self.rounds):
@@ -258,10 +283,17 @@ class CrossValidation: @@ -258,10 +283,17 @@ class CrossValidation:
258 logging.critical("Empty cross_item_score.") 283 logging.critical("Empty cross_item_score.")
259 raise Error 284 raise Error
260 round_partition[random_key] = cross_item_score.pop(random_key) 285 round_partition[random_key] = cross_item_score.pop(random_key)
  286 + #logging.debug("Round partition: %s",str(round_partition))
  287 + #logging.debug("Cross item-score: %s",str(cross_item_score))
261 round_user = User(cross_item_score) 288 round_user = User(cross_item_score)
262 - predicted_result = self.recommender.get_recommendation(round_user)  
263 - real_result = RecommendationResult(round_partition,len(round_partition))  
264 - evaluation = Evaluation(predicted_result,real_result) 289 + result_size = int(self.recommender.items_repository.get_doccount()*
  290 + self.result_proportion)
  291 + predicted_result = self.recommender.get_recommendation(round_user,result_size)
  292 + print len(round_partition)
  293 + real_result = RecommendationResult(round_partition)
  294 + #logging.debug("Predicted result: %s",predicted_result)
  295 + evaluation = Evaluation(predicted_result,real_result,
  296 + self.recommender.items_repository.get_doccount())
265 for metric in self.metrics_list: 297 for metric in self.metrics_list:
266 result = evaluation.run(metric) 298 result = evaluation.run(metric)
267 self.cross_results[metric.desc].append(result) 299 self.cross_results[metric.desc].append(result)
@@ -269,3 +301,26 @@ class CrossValidation: @@ -269,3 +301,26 @@ class CrossValidation:
269 item,score = round_partition.popitem() 301 item,score = round_partition.popitem()
270 cross_item_score[item] = score 302 cross_item_score[item] = score
271 303
  304 + def __str__(self):
  305 + """
  306 + String representation of the object.
  307 + """
  308 + str = "\n"
  309 + metrics_desc = ""
  310 + for metric in self.metrics_list:
  311 + metrics_desc += "%s|" % (metric.desc)
  312 + str += "| Round |%s\n" % metrics_desc
  313 + for r in range(self.rounds):
  314 + metrics_result = ""
  315 + for metric in self.metrics_list:
  316 + metrics_result += (" %2.1f%% |" %
  317 + (self.cross_results[metric.desc][r]*100))
  318 + str += "| %d |%s\n" % (r,metrics_result)
  319 + metrics_mean = ""
  320 + for metric in self.metrics_list:
  321 + mean = float(sum(self.cross_results[metric.desc]) /
  322 + len(self.cross_results[metric.desc]))
  323 + metrics_mean += " %2.1f%% |" % (mean*100)
  324 + str += "| Mean |%s\n" % (metrics_mean)
  325 + return str
  326 +
src/recommender.py
@@ -45,13 +45,15 @@ class RecommendationResult: @@ -45,13 +45,15 @@ class RecommendationResult:
45 str += "%2d: %s\n" % (i,result[i][0]) 45 str += "%2d: %s\n" % (i,result[i][0])
46 return str 46 return str
47 47
48 - def get_prediction(self,limit=20): 48 + def get_prediction(self,limit=0):
49 """ 49 """
50 Return prediction based on recommendation size (number of items). 50 Return prediction based on recommendation size (number of items).
51 """ 51 """
52 - if limit > self.size: limit = self.size  
53 sorted_result = sorted(self.item_score.items(), 52 sorted_result = sorted(self.item_score.items(),
54 key=operator.itemgetter(1)) 53 key=operator.itemgetter(1))
  54 + if not limit or limit > self.size:
  55 + limit = self.size
  56 +
55 return list(reversed(sorted_result[-limit:])) 57 return list(reversed(sorted_result[-limit:]))
56 58
57 class Recommender: 59 class Recommender:
@@ -63,13 +65,12 @@ class Recommender: @@ -63,13 +65,12 @@ class Recommender:
63 Set initial parameters. 65 Set initial parameters.
64 """ 66 """
65 self.items_repository = xapian.Database(cfg.axi) 67 self.items_repository = xapian.Database(cfg.axi)
66 - self.users_repository = data.PopconXapianIndex(cfg)  
67 - #self.clustered_users_repository = data.PopconXapianIndex(cfg)  
68 self.set_strategy(cfg.strategy) 68 self.set_strategy(cfg.strategy)
69 if cfg.weight == "bm25": 69 if cfg.weight == "bm25":
70 self.weight = xapian.BM25Weight() 70 self.weight = xapian.BM25Weight()
71 else: 71 else:
72 self.weight = xapian.TradWeight() 72 self.weight = xapian.TradWeight()
  73 + self.cfg = cfg
73 74
74 def set_strategy(self,strategy_str): 75 def set_strategy(self,strategy_str):
75 """ 76 """
@@ -83,6 +84,7 @@ class Recommender: @@ -83,6 +84,7 @@ class Recommender:
83 self.strategy = strategy.ContentBasedStrategy("desc") 84 self.strategy = strategy.ContentBasedStrategy("desc")
84 if strategy_str == "col": 85 if strategy_str == "col":
85 self.strategy = strategy.CollaborativeStrategy(20) 86 self.strategy = strategy.CollaborativeStrategy(20)
  87 + self.users_repository = data.PopconXapianIndex(self.cfg)
86 88
87 def get_recommendation(self,user,result_size=20): 89 def get_recommendation(self,user,result_size=20):
88 """ 90 """
src/tests/data_tests.py
@@ -71,13 +71,13 @@ class PopconXapianIndexTests(unittest2.TestCase): @@ -71,13 +71,13 @@ class PopconXapianIndexTests(unittest2.TestCase):
71 71
72 def test_reindex(self): 72 def test_reindex(self):
73 # force reindex with no clustering 73 # force reindex with no clustering
74 - self.cfg.index_mode = "10" 74 + self.cfg.index_mode = "reindex"
75 pxi = PopconXapianIndex(self.cfg) 75 pxi = PopconXapianIndex(self.cfg)
76 self.assertEqual(pxi.get_metadata("old"),"") 76 self.assertEqual(pxi.get_metadata("old"),"")
77 77
78 def test_clustering(self): 78 def test_clustering(self):
79 # force reindex with clustering 79 # force reindex with clustering
80 - self.cfg.index_mode = "11" 80 + self.cfg.index_mode = "cluster"
81 pxi = PopconXapianIndex(self.cfg) 81 pxi = PopconXapianIndex(self.cfg)
82 self.assertEqual(pxi.source_dir,self.cfg.clusters_dir) 82 self.assertEqual(pxi.source_dir,self.cfg.clusters_dir)
83 all_submissions = [submissions for (root, dirs, submissions) in 83 all_submissions = [submissions for (root, dirs, submissions) in
@@ -95,6 +95,13 @@ class PopconXapianIndexTests(unittest2.TestCase): @@ -95,6 +95,13 @@ class PopconXapianIndexTests(unittest2.TestCase):
95 sum([len(submissions) for submissions in 95 sum([len(submissions) for submissions in
96 all_submissions])) 96 all_submissions]))
97 97
  98 + def test_recluster(self):
  99 + # force reindexing and clustering
  100 + self.cfg.index_mode = "recluster"
  101 + self.cfg.k_medoids = 2
  102 + pxi = PopconXapianIndex(self.cfg)
  103 + self.assertEqual(pxi.source_dir,self.cfg.clusters_dir)
  104 + self.assertEqual(pxi.get_doccount(),2)
98 105
99 if __name__ == '__main__': 106 if __name__ == '__main__':
100 unittest2.main() 107 unittest2.main()
src/tests/test_data/popcon_dir/test_popcon_0
1 POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_0 ARCH:i386 POPCONVER:1.52 1 POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_0 ARCH:i386 POPCONVER:1.52
2 -1309407475 1303670994 perl-base /usr/bin/perl  
3 -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so  
4 -1309407450 1303670973 libc6 /lib/ld-2.11.2.so 2 +1309407475 1303670994 gimp /usr/bin/perl
  3 +1309407451 1303670982 inkscape /lib/i686/cmov/libc-2.11.2.so
  4 +1309407450 1303670973 imagination /lib/ld-2.11.2.so
5 1309407434 1295654294 dash /bin/dash 5 1309407434 1295654294 dash /bin/dash
6 0 0 libusbmuxd1 <NOFILES> 6 0 0 libusbmuxd1 <NOFILES>
7 END-POPULARITY-CONTEST-0 TIME:1309407492 7 END-POPULARITY-CONTEST-0 TIME:1309407492
src/tests/test_data/popcon_dir/test_popcon_1
1 POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_1 ARCH:i386 POPCONVER:1.52 1 POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_1 ARCH:i386 POPCONVER:1.52
2 -1309407475 1303670994 perl-base /usr/bin/perl  
3 -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so  
4 -1309407450 1303670973 libc6 /lib/ld-2.11.2.so 2 +1309407475 1303670994 gimp /usr/bin/perl
5 1309407434 1295654294 dash /bin/dash 3 1309407434 1295654294 dash /bin/dash
6 0 0 libusbmuxd1 <NOFILES> 4 0 0 libusbmuxd1 <NOFILES>
7 END-POPULARITY-CONTEST-0 TIME:1309407492 5 END-POPULARITY-CONTEST-0 TIME:1309407492
src/tests/test_data/popcon_dir/test_popcon_2
1 POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_2 ARCH:i386 POPCONVER:1.52 1 POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_2 ARCH:i386 POPCONVER:1.52
2 -1309407475 1303670994 perl-base /usr/bin/perl  
3 -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so 2 +1309407475 1303670994 iceweasel /usr/bin/perl
  3 +1309407451 1303670982 python /lib/i686/cmov/libc-2.11.2.so
4 1309407450 1303670973 libc6 /lib/ld-2.11.2.so 4 1309407450 1303670973 libc6 /lib/ld-2.11.2.so
5 1309407434 1295654294 dash /bin/dash 5 1309407434 1295654294 dash /bin/dash
6 0 0 libusbmuxd1 <NOFILES> 6 0 0 libusbmuxd1 <NOFILES>
src/tests/test_data/popcon_dir/test_popcon_3
1 POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_3 ARCH:i386 POPCONVER:1.52 1 POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_3 ARCH:i386 POPCONVER:1.52
2 -1309407475 1303670994 perl-base /usr/bin/perl  
3 -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so 2 +1309407475 1303670994 eog /usr/bin/perl
  3 +1309407451 1303670982 nautilus /lib/i686/cmov/libc-2.11.2.so
4 1309407450 1303670973 libc6 /lib/ld-2.11.2.so 4 1309407450 1303670973 libc6 /lib/ld-2.11.2.so
5 -1309407434 1295654294 dash /bin/dash  
6 0 0 libusbmuxd1 <NOFILES> 5 0 0 libusbmuxd1 <NOFILES>
7 END-POPULARITY-CONTEST-0 TIME:1309407492 6 END-POPULARITY-CONTEST-0 TIME:1309407492
src/tests/test_data/popcon_dir/test_popcon_4
1 POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_4 ARCH:i386 POPCONVER:1.52 1 POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_4 ARCH:i386 POPCONVER:1.52
2 -1309407475 1303670994 perl-base /usr/bin/perl  
3 -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so 2 +1309407475 1303670994 konqueror /usr/bin/perl
  3 +1309407451 1303670982 kedit /lib/i686/cmov/libc-2.11.2.so
4 1309407450 1303670973 libc6 /lib/ld-2.11.2.so 4 1309407450 1303670973 libc6 /lib/ld-2.11.2.so
5 -1309407434 1295654294 dash /bin/dash  
6 0 0 libusbmuxd1 <NOFILES> 5 0 0 libusbmuxd1 <NOFILES>
7 END-POPULARITY-CONTEST-0 TIME:1309407492 6 END-POPULARITY-CONTEST-0 TIME:1309407492
src/tests/test_data/popcon_dir/test_popcon_5
1 POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_5 ARCH:i386 POPCONVER:1.52 1 POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_5 ARCH:i386 POPCONVER:1.52
2 -1309407475 1303670994 perl-base /usr/bin/perl  
3 -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so  
4 -1309407450 1303670973 libc6 /lib/ld-2.11.2.so 2 +1309407475 1303670994 konqueror /usr/bin/perl
5 1309407434 1295654294 dash /bin/dash 3 1309407434 1295654294 dash /bin/dash
6 0 0 libusbmuxd1 <NOFILES> 4 0 0 libusbmuxd1 <NOFILES>
7 END-POPULARITY-CONTEST-0 TIME:1309407492 5 END-POPULARITY-CONTEST-0 TIME:1309407492
src/tests/test_data/popcon_dir/test_popcon_6
1 POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_6 ARCH:i386 POPCONVER:1.52 1 POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_6 ARCH:i386 POPCONVER:1.52
2 1309407475 1303670994 perl-base /usr/bin/perl 2 1309407475 1303670994 perl-base /usr/bin/perl
3 -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so  
4 -1309407450 1303670973 libc6 /lib/ld-2.11.2.so 3 +1309407451 1303670982 eog /lib/i686/cmov/libc-2.11.2.so
  4 +1309407450 1303670973 nautilus /lib/ld-2.11.2.so
5 1309407434 1295654294 dash /bin/dash 5 1309407434 1295654294 dash /bin/dash
6 0 0 libusbmuxd1 <NOFILES> 6 0 0 libusbmuxd1 <NOFILES>
7 END-POPULARITY-CONTEST-0 TIME:1309407492 7 END-POPULARITY-CONTEST-0 TIME:1309407492
src/tests/test_data/popcon_dir/test_popcon_7
1 POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_7 ARCH:i386 POPCONVER:1.52 1 POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_7 ARCH:i386 POPCONVER:1.52
2 -1309407475 1303670994 perl-base /usr/bin/perl  
3 -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so  
4 -1309407450 1303670973 libc6 /lib/ld-2.11.2.so 2 +1309407475 1303670994 apticron /usr/bin/perl
  3 +1309407451 1303670982 aptitude /lib/i686/cmov/libc-2.11.2.so
  4 +1309407450 1303670973 apt /lib/ld-2.11.2.so
5 1309407434 1295654294 dash /bin/dash 5 1309407434 1295654294 dash /bin/dash
6 0 0 libusbmuxd1 <NOFILES> 6 0 0 libusbmuxd1 <NOFILES>
7 END-POPULARITY-CONTEST-0 TIME:1309407492 7 END-POPULARITY-CONTEST-0 TIME:1309407492
src/tests/test_data/popcon_dir/test_popcon_8
1 POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_8 ARCH:i386 POPCONVER:1.52 1 POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_8 ARCH:i386 POPCONVER:1.52
2 -1309407475 1303670994 perl-base /usr/bin/perl  
3 -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so  
4 -1309407450 1303670973 libc6 /lib/ld-2.11.2.so 2 +1309407475 1303670994 apticron /usr/bin/perl
  3 +1309407451 1303670982 eog /lib/i686/cmov/libc-2.11.2.so
  4 +1309407450 1303670973 nautilus /lib/ld-2.11.2.so
5 1309407434 1295654294 dash /bin/dash 5 1309407434 1295654294 dash /bin/dash
6 0 0 libusbmuxd1 <NOFILES> 6 0 0 libusbmuxd1 <NOFILES>
7 END-POPULARITY-CONTEST-0 TIME:1309407492 7 END-POPULARITY-CONTEST-0 TIME:1309407492
@@ -152,6 +152,24 @@ class User: @@ -152,6 +152,24 @@ class User:
152 desc_profile = self.desc_profile(items_repository,size)[:size/2] 152 desc_profile = self.desc_profile(items_repository,size)[:size/2]
153 return tag_profile+desc_profile 153 return tag_profile+desc_profile
154 154
  155 + def app_pkg_profile(self,axi):
  156 + """
  157 + Return list of packages that are applications.
  158 + """
  159 + old_profile_size = len(self.pkg_profile)
  160 + for p in self.pkg_profile[:]: #iterate list copy
  161 + tags = data.axi_search_pkg_tags(axi,p)
  162 + try:
  163 +
  164 + if not "XTrole::program" in tags:
  165 + self.pkg_profile.remove(p)
  166 + except:
  167 + logging.debug("Package not found in axi: %s" % p)
  168 + profile_size = len(self.pkg_profile)
  169 + logging.debug("App package profile: reduced packages profile size \
  170 + from %d to %d." % (old_profile_size, profile_size))
  171 + return self.pkg_profile
  172 +
155 def maximal_pkg_profile(self): 173 def maximal_pkg_profile(self):
156 """ 174 """
157 Return list of packages that are not a dependency of any other package in 175 Return list of packages that are not a dependency of any other package in