Commit 65be4b76c9e779b7f600b211e41649b0310b3eaa
Exists in master and in 1 other branch
Merge remote branch 'upstream/master'
Conflicts: src/data.py
Showing 15 changed files with 198 additions and 122 deletions
Show diff stats
src/config.py
... | ... | @@ -44,7 +44,8 @@ class Config(): |
44 | 44 | self.popcon_index = os.path.expanduser("~/.app-recommender/popcon_index") |
45 | 45 | self.popcon_dir = os.path.expanduser("~/.app-recommender/popcon_dir") |
46 | 46 | self.clusters_dir = os.path.expanduser("~/.app-recommender/clusters_dir") |
47 | - self.index_mode = "0" # use old index | |
47 | + self.k_medoids = 100 | |
48 | + self.index_mode = "old" | |
48 | 49 | self.strategy = "cb" |
49 | 50 | self.weight = "bm25" |
50 | 51 | self.load_options() |
... | ... | @@ -65,8 +66,9 @@ class Config(): |
65 | 66 | print " -a, --axi=PATH Path to Apt-xapian-index" |
66 | 67 | print " -p, --popconindex=PATH Path to popcon dedicated index" |
67 | 68 | print " -m, --popcondir=PATH Path to popcon submissions dir" |
68 | - print " -u, --index_mode= 0: old, 1:reindex, 11:clustered_index" | |
69 | + print " -u, --indexmode= old, reindex, cluster, recluster" | |
69 | 70 | print " -l, --clustersdir=PATH Path to popcon clusters dir" |
71 | + print " -e, --medoids=k Number of medoids for clustering" | |
70 | 72 | print " -w, --weight=OPTION Search weighting scheme" |
71 | 73 | print " -s, --strategy=OPTION Recommendation strategy" |
72 | 74 | print "" |
... | ... | @@ -115,13 +117,14 @@ class Config(): |
115 | 117 | self.popcon_dir = self.read_option('recommender', 'popcon_dir') |
116 | 118 | self.index_mode = self.read_option('recommender', 'index_mode') |
117 | 119 | self.clusters_dir = self.read_option('recommender', 'clusters_dir') |
120 | + self.k_medoids = self.read_option('recommender', 'k_medoids') | |
118 | 121 | self.weight = self.read_option('recommender', 'weight') |
119 | 122 | self.strategy = self.read_option('recommender', 'strategy') |
120 | 123 | |
121 | - short_options = "hdvo:c:a:p:m:ul:w:s:" | |
124 | + short_options = "hdvo:c:a:p:m:ul:e:w:s:" | |
122 | 125 | long_options = ["help", "debug", "verbose", "output=", "config=", |
123 | - "axi=", "popconindex=", "popcondir=", "index_mode=", | |
124 | - "clusters_dir=", "weight=", "strategy="] | |
126 | + "axi=", "popconindex=", "popcondir=", "indexmode=", | |
127 | + "clustersdir=", "kmedoids=", "weight=", "strategy="] | |
125 | 128 | try: |
126 | 129 | opts, args = getopt.getopt(sys.argv[1:], short_options, |
127 | 130 | long_options) |
... | ... | @@ -154,6 +157,8 @@ class Config(): |
154 | 157 | self.index_mode = p |
155 | 158 | elif o in ("-l", "--clustersdir"): |
156 | 159 | self.clusters_dir = p |
160 | + elif o in ("-e", "--kmedoids"): | |
161 | + self.k_medoids = p | |
157 | 162 | elif o in ("-w", "--weight"): |
158 | 163 | self.weight = p |
159 | 164 | elif o in ("-s", "--strategy"): | ... | ... |
src/data.py
... | ... | @@ -129,31 +129,36 @@ class PopconXapianIndex(xapian.WritableDatabase): |
129 | 129 | """ |
130 | 130 | self.axi = xapian.Database(cfg.axi) |
131 | 131 | self.path = os.path.expanduser(cfg.popcon_index) |
132 | - if cfg.index_mode.startswith("1") or not self.load_index(): | |
132 | + if not cfg.index_mode == "old" or not self.load_index(): | |
133 | 133 | if not os.path.exists(cfg.popcon_dir): |
134 | 134 | os.makedirs(cfg.popcon_dir) |
135 | 135 | if not os.listdir(cfg.popcon_dir): |
136 | 136 | logging.critical("Popcon dir seems to be empty.") |
137 | 137 | raise Error |
138 | - if cfg.index_mode == "10": | |
138 | + if cfg.index_mode == "reindex": | |
139 | 139 | self.source_dir = os.path.expanduser(cfg.popcon_dir) |
140 | 140 | else: |
141 | 141 | self.source_dir = os.path.expanduser(cfg.clusters_dir) |
142 | 142 | if not os.path.exists(cfg.clusters_dir): |
143 | 143 | os.makedirs(cfg.clusters_dir) |
144 | - if not os.listdir(cfg.clusters_dir): | |
145 | - distance = JaccardDistance() | |
144 | + if not os.listdir(cfg.clusters_dir) or \ | |
145 | + cfg.index_mode == "recluster": | |
146 | + shutil.rmtree(cfg.clusters_dir,1) | |
147 | + os.makedirs(cfg.clusters_dir) | |
146 | 148 | logging.info("Clustering popcon submissions from \'%s\'" |
147 | 149 | % cfg.popcon_dir) |
148 | 150 | logging.info("Clusters will be placed at \'%s\'" |
149 | 151 | % cfg.clusters_dir) |
152 | + distance = JaccardDistance() | |
150 | 153 | data = self.get_submissions(cfg.popcon_dir) |
151 | - if cfg.clustering == "Hierarchical": | |
152 | - self.hierarchical_clustering(data,cfg.clusters_dir, | |
153 | - distance) | |
154 | - else: | |
155 | - self.kmedoids_clustering(data,cfg.clusters_dir, | |
156 | - distance) | |
154 | + self.cluster_dispersion = \ | |
155 | + self.kmedoids_clustering(data, cfg.clusters_dir, | |
156 | + distance, cfg.k_medoids) | |
157 | + logging.info("Clusters dispersion: %f.2", | |
158 | + self.cluster_dispersion) | |
159 | + else: | |
160 | + logging.info("Using clusters from \'%s\'" % | |
161 | + cfg.clusters_dir) | |
157 | 162 | self.build_index() |
158 | 163 | |
159 | 164 | def __str__(self): |
... | ... | @@ -167,10 +172,9 @@ class PopconXapianIndex(xapian.WritableDatabase): |
167 | 172 | logging.info("Opening existing popcon xapian index at \'%s\'" |
168 | 173 | % self.path) |
169 | 174 | xapian.Database.__init__(self,self.path) |
170 | - return True | |
175 | + return 1 | |
171 | 176 | except xapian.DatabaseError: |
172 | 177 | logging.info("Could not open popcon index.") |
173 | - return True | |
174 | 178 | return 0 |
175 | 179 | |
176 | 180 | def build_index(self): |
... | ... | @@ -224,35 +228,23 @@ class PopconXapianIndex(xapian.WritableDatabase): |
224 | 228 | submissions.append(submission) |
225 | 229 | return submissions |
226 | 230 | |
227 | - def hierarchical_clustering(self,data,clusters_dir,distance,k=10): | |
228 | - """ | |
229 | - Select popcon submissions from popcon_dir and place them at clusters_dir | |
230 | - """ | |
231 | - cl = cluster.HierarchicalClustering(data, lambda x,y: | |
232 | - distance(x.packages.keys(), | |
233 | - y.packages.keys())) | |
234 | - clusters = cl.getlevel(0.5) | |
235 | - for c in clusters: | |
236 | - print "cluster" | |
237 | - for submission in c: | |
238 | - print submission.user_id | |
239 | - | |
240 | - def kmedoids_clustering(self,data,clusters_dir,distance,k=10): | |
231 | + def kmedoids_clustering(self,data,clusters_dir,distance,k_medoids): | |
241 | 232 | clusters = KMedoidsClustering(data,lambda x,y: |
242 | 233 | distance(x.packages.keys(), |
243 | 234 | y.packages.keys())) |
244 | - medoids = clusters.getMedoids(2) | |
235 | + medoids,dispersion = clusters.getMedoids(k_medoids) | |
245 | 236 | for submission in medoids: |
246 | 237 | shutil.copyfile(submission.path,os.path.join(clusters_dir, |
247 | 238 | submission.user_id)) |
239 | + return dispersion | |
248 | 240 | |
249 | 241 | class KMedoidsClustering(cluster.KMeansClustering): |
250 | 242 | |
251 | - def __init__(self,data,distance): | |
252 | - if len(data)<100: | |
243 | + def __init__(self,data,distance,max_data=100): | |
244 | + if len(data)<max_data: | |
253 | 245 | data_sample = data |
254 | 246 | else: |
255 | - data_sample = random.sample(data,100) | |
247 | + data_sample = random.sample(data,max_data) | |
256 | 248 | cluster.KMeansClustering.__init__(self, data_sample, distance) |
257 | 249 | self.distanceMatrix = {} |
258 | 250 | for submission in self._KMeansClustering__data: |
... | ... | @@ -287,7 +279,7 @@ class KMedoidsClustering(cluster.KMeansClustering): |
287 | 279 | logging.debug("medoidDistance: %f" % medoidDistance) |
288 | 280 | logging.debug("Cluster medoid: [%d] %s" % (medoid, |
289 | 281 | cluster[medoid].user_id)) |
290 | - return cluster[medoid] | |
282 | + return (cluster[medoid],medoidDistance) | |
291 | 283 | |
292 | 284 | def assign_item(self, item, origin): |
293 | 285 | """ |
... | ... | @@ -295,7 +287,8 @@ class KMedoidsClustering(cluster.KMeansClustering): |
295 | 287 | """ |
296 | 288 | closest_cluster = origin |
297 | 289 | for cluster in self._KMeansClustering__clusters: |
298 | - if self.distance(item,self.getMedoid(cluster)) < self.distance(item,self.getMedoid(closest_cluster)): | |
290 | + if self.distance(item,self.getMedoid(cluster)[0]) < \ | |
291 | + self.distance(item,self.getMedoid(closest_cluster)[0]): | |
299 | 292 | closest_cluster = cluster |
300 | 293 | |
301 | 294 | if closest_cluster != origin: |
... | ... | @@ -309,6 +302,8 @@ class KMedoidsClustering(cluster.KMeansClustering): |
309 | 302 | """ |
310 | 303 | Generate n clusters and return their medoids. |
311 | 304 | """ |
312 | - medoids = [self.getMedoid(cluster) for cluster in self.getclusters(n)] | |
313 | - logging.info("Clustering completed and the following centroids were found: %s" % [c.user_id for c in medoids]) | |
314 | - return medoids | |
305 | + medoids_distances = [self.getMedoid(cluster) for cluster in self.getclusters(n)] | |
306 | + medoids = [m[0] for m in medoids_distances] | |
307 | + dispersion = sum([m[1] for m in medoids_distances]) | |
308 | + logging.info("Clustering completed and the following medoids were found: %s" % [c.user_id for c in medoids]) | |
309 | + return medoids,dispersion | ... | ... |
src/evaluation.py
... | ... | @@ -49,6 +49,45 @@ class Metric(Singleton): |
49 | 49 | evaluation.real_item_scores[k])) |
50 | 50 | return errors |
51 | 51 | |
52 | + | |
53 | +class SimpleAccuracy(Metric): | |
54 | + """ | |
55 | + Classification accuracy metric which consider classes sizes. | |
56 | + """ | |
57 | + def __init__(self): | |
58 | + """ | |
59 | + Set metric description. | |
60 | + """ | |
61 | + self.desc = " S_Accuracy " | |
62 | + | |
63 | + def run(self,evaluation): | |
64 | + """ | |
65 | + Compute metric. | |
66 | + """ | |
67 | + return float((evaluation.repository_size- | |
68 | + len(evaluation.false_positive))- | |
69 | + len(evaluation.false_negative))/evaluation.repository_size | |
70 | + | |
71 | +class Accuracy(Metric): | |
72 | + """ | |
73 | + Classification accuracy metric which consider classes sizes. | |
74 | + """ | |
75 | + def __init__(self): | |
76 | + """ | |
77 | + Set metric description. | |
78 | + """ | |
79 | + self.desc = " Accuracy " | |
80 | + | |
81 | + def run(self,evaluation): | |
82 | + """ | |
83 | + Compute metric. | |
84 | + """ | |
85 | + error_1 = (float(len(evaluation.false_positive))/ | |
86 | + (evaluation.repository_size-len(evaluation.real_relevant))) | |
87 | + error_2 = (float(len(evaluation.false_negative))/len(evaluation.real_relevant)) | |
88 | + accuracy = 1-(float(error_1+error_2)/2) | |
89 | + return accuracy | |
90 | + | |
52 | 91 | class Precision(Metric): |
53 | 92 | """ |
54 | 93 | Classification accuracy metric defined as the percentage of relevant itens |
... | ... | @@ -64,7 +103,7 @@ class Precision(Metric): |
64 | 103 | """ |
65 | 104 | Compute metric. |
66 | 105 | """ |
67 | - return float(len(evaluation.predicted_real))/len(evaluation.predicted_relevant) | |
106 | + return float(len(evaluation.true_positive))/len(evaluation.predicted_relevant) | |
68 | 107 | |
69 | 108 | class Recall(Metric): |
70 | 109 | """ |
... | ... | @@ -81,7 +120,7 @@ class Recall(Metric): |
81 | 120 | """ |
82 | 121 | Compute metric. |
83 | 122 | """ |
84 | - return float(len(evaluation.predicted_real))/len(evaluation.real_relevant) | |
123 | + return float(len(evaluation.true_positive))/len(evaluation.real_relevant) | |
85 | 124 | |
86 | 125 | class F1(Metric): |
87 | 126 | """ |
... | ... | @@ -100,7 +139,10 @@ class F1(Metric): |
100 | 139 | """ |
101 | 140 | p = Precision().run(evaluation) |
102 | 141 | r = Recall().run(evaluation) |
103 | - return float((2*p*r))/(p+r) | |
142 | + if (p+r)>0: | |
143 | + return float((2*p*r))/(p+r) | |
144 | + else: | |
145 | + return 0 | |
104 | 146 | |
105 | 147 | class MAE(Metric): |
106 | 148 | """ |
... | ... | @@ -158,43 +200,47 @@ class Coverage(Metric): |
158 | 200 | Evaluation metric defined as the percentage of itens covered by the |
159 | 201 | recommender (have been recommended at least once). |
160 | 202 | """ |
161 | - def __init__(self,repository_size): | |
203 | + def __init__(self): | |
162 | 204 | """ |
163 | 205 | Set initial parameters. |
164 | 206 | """ |
165 | 207 | self.desc = " Coverage " |
166 | - self.repository_size = repository_size | |
167 | - self.covered = set() | |
168 | - | |
169 | - def save_covered(self,recommended_list): | |
170 | - """ | |
171 | - Register that a list of itens has been recommended. | |
172 | - """ | |
173 | - self.covered.update(set(recommended_list)) | |
174 | 208 | |
175 | - def run(self,evaluation): | |
209 | + def run(self,evaluations_set): | |
176 | 210 | """ |
177 | 211 | Compute metric. |
178 | 212 | """ |
179 | - return float(self.covered.size)/self.repository_size | |
213 | + covered = set() | |
214 | + for evaluation in evaluations_set: | |
215 | + covered.update(set(evaluation.predicted_relevant)) | |
216 | + return float(len(covered))/evaluation.repository_size | |
180 | 217 | |
181 | 218 | class Evaluation: |
182 | 219 | """ |
183 | 220 | Class designed to perform prediction evaluation, given data and metric. |
184 | 221 | """ |
185 | - def __init__(self,predicted_result,real_result): | |
222 | + def __init__(self,predicted,real,repository_size): | |
186 | 223 | """ |
187 | 224 | Set initial parameters. |
188 | 225 | """ |
189 | - self.predicted_item_scores = predicted_result.item_score | |
190 | - self.predicted_relevant = predicted_result.get_prediction() | |
191 | - self.real_item_scores = real_result.item_score | |
192 | - self.real_relevant = real_result.get_prediction() | |
193 | - self.predicted_real = [v for v in self.predicted_relevant if v in | |
194 | - self.real_relevant] | |
195 | - #print len(self.predicted_relevant) | |
196 | - #print len(self.real_relevant) | |
197 | - #print len(self.predicted_real) | |
226 | + self.repository_size = repository_size | |
227 | + self.predicted_item_scores = predicted.item_score | |
228 | + self.predicted_relevant = predicted.get_prediction() | |
229 | + self.real_item_scores = real.item_score | |
230 | + self.real_relevant = real.get_prediction() | |
231 | + | |
232 | + self.true_positive = [v[0] for v in self.predicted_relevant if v[0] in | |
233 | + [w[0] for w in self.real_relevant]] | |
234 | + self.false_positive = [v[0] for v in self.predicted_relevant if not v[0] in | |
235 | + [w[0] for w in self.real_relevant]] | |
236 | + self.false_negative = [v[0] for v in self.real_relevant if not v[0] in | |
237 | + [w[0] for w in self.predicted_relevant]] | |
238 | + | |
239 | + logging.debug("TP: %d" % len(self.true_positive)) | |
240 | + logging.debug("FP: %d" % len(self.false_positive)) | |
241 | + logging.debug("FN: %d" % len(self.false_negative)) | |
242 | + logging.debug("Repo_size: %d" % self.repository_size) | |
243 | + logging.debug("Relevant: %d" % len(self.real_relevant)) | |
198 | 244 | |
199 | 245 | def run(self,metric): |
200 | 246 | """ |
... | ... | @@ -206,7 +252,7 @@ class CrossValidation: |
206 | 252 | """ |
207 | 253 | Class designed to perform cross-validation process. |
208 | 254 | """ |
209 | - def __init__(self,partition_proportion,rounds,rec,metrics_list): | |
255 | + def __init__(self,partition_proportion,rounds,rec,metrics_list,result_proportion): | |
210 | 256 | """ |
211 | 257 | Set initial parameters. |
212 | 258 | """ |
... | ... | @@ -219,34 +265,13 @@ class CrossValidation: |
219 | 265 | self.recommender = rec |
220 | 266 | self.metrics_list = metrics_list |
221 | 267 | self.cross_results = defaultdict(list) |
222 | - | |
223 | - def __str__(self): | |
224 | - """ | |
225 | - String representation of the object. | |
226 | - """ | |
227 | - str = "\n" | |
228 | - metrics_desc = "" | |
229 | - for metric in self.metrics_list: | |
230 | - metrics_desc += "%s|" % (metric.desc) | |
231 | - str += "| Round |%s\n" % metrics_desc | |
232 | - for r in range(self.rounds): | |
233 | - metrics_result = "" | |
234 | - for metric in self.metrics_list: | |
235 | - metrics_result += (" %2.1f%% |" % | |
236 | - (self.cross_results[metric.desc][r]*100)) | |
237 | - str += "| %d |%s\n" % (r,metrics_result) | |
238 | - metrics_mean = "" | |
239 | - for metric in self.metrics_list: | |
240 | - mean = float(sum(self.cross_results[metric.desc]) / | |
241 | - len(self.cross_results[metric.desc])) | |
242 | - metrics_mean += " %2.1f%% |" % (mean*100) | |
243 | - str += "| Mean |%s\n" % (metrics_mean) | |
244 | - return str | |
268 | + self.result_proportion = result_proportion | |
245 | 269 | |
246 | 270 | def run(self,user): |
247 | 271 | """ |
248 | 272 | Perform cross-validation. |
249 | 273 | """ |
274 | + # | |
250 | 275 | cross_item_score = dict.fromkeys(user.pkg_profile,1) |
251 | 276 | partition_size = int(len(cross_item_score)*self.partition_proportion) |
252 | 277 | for r in range(self.rounds): |
... | ... | @@ -258,10 +283,17 @@ class CrossValidation: |
258 | 283 | logging.critical("Empty cross_item_score.") |
259 | 284 | raise Error |
260 | 285 | round_partition[random_key] = cross_item_score.pop(random_key) |
286 | + #logging.debug("Round partition: %s",str(round_partition)) | |
287 | + #logging.debug("Cross item-score: %s",str(cross_item_score)) | |
261 | 288 | round_user = User(cross_item_score) |
262 | - predicted_result = self.recommender.get_recommendation(round_user) | |
263 | - real_result = RecommendationResult(round_partition,len(round_partition)) | |
264 | - evaluation = Evaluation(predicted_result,real_result) | |
289 | + result_size = int(self.recommender.items_repository.get_doccount()* | |
290 | + self.result_proportion) | |
291 | + predicted_result = self.recommender.get_recommendation(round_user,result_size) | |
292 | + print len(round_partition) | |
293 | + real_result = RecommendationResult(round_partition) | |
294 | + #logging.debug("Predicted result: %s",predicted_result) | |
295 | + evaluation = Evaluation(predicted_result,real_result, | |
296 | + self.recommender.items_repository.get_doccount()) | |
265 | 297 | for metric in self.metrics_list: |
266 | 298 | result = evaluation.run(metric) |
267 | 299 | self.cross_results[metric.desc].append(result) |
... | ... | @@ -269,3 +301,26 @@ class CrossValidation: |
269 | 301 | item,score = round_partition.popitem() |
270 | 302 | cross_item_score[item] = score |
271 | 303 | |
304 | + def __str__(self): | |
305 | + """ | |
306 | + String representation of the object. | |
307 | + """ | |
308 | + str = "\n" | |
309 | + metrics_desc = "" | |
310 | + for metric in self.metrics_list: | |
311 | + metrics_desc += "%s|" % (metric.desc) | |
312 | + str += "| Round |%s\n" % metrics_desc | |
313 | + for r in range(self.rounds): | |
314 | + metrics_result = "" | |
315 | + for metric in self.metrics_list: | |
316 | + metrics_result += (" %2.1f%% |" % | |
317 | + (self.cross_results[metric.desc][r]*100)) | |
318 | + str += "| %d |%s\n" % (r,metrics_result) | |
319 | + metrics_mean = "" | |
320 | + for metric in self.metrics_list: | |
321 | + mean = float(sum(self.cross_results[metric.desc]) / | |
322 | + len(self.cross_results[metric.desc])) | |
323 | + metrics_mean += " %2.1f%% |" % (mean*100) | |
324 | + str += "| Mean |%s\n" % (metrics_mean) | |
325 | + return str | |
326 | + | ... | ... |
src/recommender.py
... | ... | @@ -45,13 +45,15 @@ class RecommendationResult: |
45 | 45 | str += "%2d: %s\n" % (i,result[i][0]) |
46 | 46 | return str |
47 | 47 | |
48 | - def get_prediction(self,limit=20): | |
48 | + def get_prediction(self,limit=0): | |
49 | 49 | """ |
50 | 50 | Return prediction based on recommendation size (number of items). |
51 | 51 | """ |
52 | - if limit > self.size: limit = self.size | |
53 | 52 | sorted_result = sorted(self.item_score.items(), |
54 | 53 | key=operator.itemgetter(1)) |
54 | + if not limit or limit > self.size: | |
55 | + limit = self.size | |
56 | + | |
55 | 57 | return list(reversed(sorted_result[-limit:])) |
56 | 58 | |
57 | 59 | class Recommender: |
... | ... | @@ -63,13 +65,12 @@ class Recommender: |
63 | 65 | Set initial parameters. |
64 | 66 | """ |
65 | 67 | self.items_repository = xapian.Database(cfg.axi) |
66 | - self.users_repository = data.PopconXapianIndex(cfg) | |
67 | - #self.clustered_users_repository = data.PopconXapianIndex(cfg) | |
68 | 68 | self.set_strategy(cfg.strategy) |
69 | 69 | if cfg.weight == "bm25": |
70 | 70 | self.weight = xapian.BM25Weight() |
71 | 71 | else: |
72 | 72 | self.weight = xapian.TradWeight() |
73 | + self.cfg = cfg | |
73 | 74 | |
74 | 75 | def set_strategy(self,strategy_str): |
75 | 76 | """ |
... | ... | @@ -83,6 +84,7 @@ class Recommender: |
83 | 84 | self.strategy = strategy.ContentBasedStrategy("desc") |
84 | 85 | if strategy_str == "col": |
85 | 86 | self.strategy = strategy.CollaborativeStrategy(20) |
87 | + self.users_repository = data.PopconXapianIndex(self.cfg) | |
86 | 88 | |
87 | 89 | def get_recommendation(self,user,result_size=20): |
88 | 90 | """ | ... | ... |
src/tests/data_tests.py
... | ... | @@ -71,13 +71,13 @@ class PopconXapianIndexTests(unittest2.TestCase): |
71 | 71 | |
72 | 72 | def test_reindex(self): |
73 | 73 | # force reindex with no clustering |
74 | - self.cfg.index_mode = "10" | |
74 | + self.cfg.index_mode = "reindex" | |
75 | 75 | pxi = PopconXapianIndex(self.cfg) |
76 | 76 | self.assertEqual(pxi.get_metadata("old"),"") |
77 | 77 | |
78 | 78 | def test_clustering(self): |
79 | 79 | # force reindex with clustering |
80 | - self.cfg.index_mode = "11" | |
80 | + self.cfg.index_mode = "cluster" | |
81 | 81 | pxi = PopconXapianIndex(self.cfg) |
82 | 82 | self.assertEqual(pxi.source_dir,self.cfg.clusters_dir) |
83 | 83 | all_submissions = [submissions for (root, dirs, submissions) in |
... | ... | @@ -95,6 +95,13 @@ class PopconXapianIndexTests(unittest2.TestCase): |
95 | 95 | sum([len(submissions) for submissions in |
96 | 96 | all_submissions])) |
97 | 97 | |
98 | + def test_recluster(self): | |
99 | + # force reindexing and clustering | |
100 | + self.cfg.index_mode = "recluster" | |
101 | + self.cfg.k_medoids = 2 | |
102 | + pxi = PopconXapianIndex(self.cfg) | |
103 | + self.assertEqual(pxi.source_dir,self.cfg.clusters_dir) | |
104 | + self.assertEqual(pxi.get_doccount(),2) | |
98 | 105 | |
99 | 106 | if __name__ == '__main__': |
100 | 107 | unittest2.main() | ... | ... |
src/tests/test_data/popcon_dir/test_popcon_0
1 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_0 ARCH:i386 POPCONVER:1.52 |
2 | -1309407475 1303670994 perl-base /usr/bin/perl | |
3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | |
4 | -1309407450 1303670973 libc6 /lib/ld-2.11.2.so | |
2 | +1309407475 1303670994 gimp /usr/bin/perl | |
3 | +1309407451 1303670982 inkscape /lib/i686/cmov/libc-2.11.2.so | |
4 | +1309407450 1303670973 imagination /lib/ld-2.11.2.so | |
5 | 5 | 1309407434 1295654294 dash /bin/dash |
6 | 6 | 0 0 libusbmuxd1 <NOFILES> |
7 | 7 | END-POPULARITY-CONTEST-0 TIME:1309407492 | ... | ... |
src/tests/test_data/popcon_dir/test_popcon_1
1 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_1 ARCH:i386 POPCONVER:1.52 |
2 | -1309407475 1303670994 perl-base /usr/bin/perl | |
3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | |
4 | -1309407450 1303670973 libc6 /lib/ld-2.11.2.so | |
2 | +1309407475 1303670994 gimp /usr/bin/perl | |
5 | 3 | 1309407434 1295654294 dash /bin/dash |
6 | 4 | 0 0 libusbmuxd1 <NOFILES> |
7 | 5 | END-POPULARITY-CONTEST-0 TIME:1309407492 | ... | ... |
src/tests/test_data/popcon_dir/test_popcon_2
1 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_2 ARCH:i386 POPCONVER:1.52 |
2 | -1309407475 1303670994 perl-base /usr/bin/perl | |
3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | |
2 | +1309407475 1303670994 iceweasel /usr/bin/perl | |
3 | +1309407451 1303670982 python /lib/i686/cmov/libc-2.11.2.so | |
4 | 4 | 1309407450 1303670973 libc6 /lib/ld-2.11.2.so |
5 | 5 | 1309407434 1295654294 dash /bin/dash |
6 | 6 | 0 0 libusbmuxd1 <NOFILES> | ... | ... |
src/tests/test_data/popcon_dir/test_popcon_3
1 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_3 ARCH:i386 POPCONVER:1.52 |
2 | -1309407475 1303670994 perl-base /usr/bin/perl | |
3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | |
2 | +1309407475 1303670994 eog /usr/bin/perl | |
3 | +1309407451 1303670982 nautilus /lib/i686/cmov/libc-2.11.2.so | |
4 | 4 | 1309407450 1303670973 libc6 /lib/ld-2.11.2.so |
5 | -1309407434 1295654294 dash /bin/dash | |
6 | 5 | 0 0 libusbmuxd1 <NOFILES> |
7 | 6 | END-POPULARITY-CONTEST-0 TIME:1309407492 | ... | ... |
src/tests/test_data/popcon_dir/test_popcon_4
1 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_4 ARCH:i386 POPCONVER:1.52 |
2 | -1309407475 1303670994 perl-base /usr/bin/perl | |
3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | |
2 | +1309407475 1303670994 konqueror /usr/bin/perl | |
3 | +1309407451 1303670982 kedit /lib/i686/cmov/libc-2.11.2.so | |
4 | 4 | 1309407450 1303670973 libc6 /lib/ld-2.11.2.so |
5 | -1309407434 1295654294 dash /bin/dash | |
6 | 5 | 0 0 libusbmuxd1 <NOFILES> |
7 | 6 | END-POPULARITY-CONTEST-0 TIME:1309407492 | ... | ... |
src/tests/test_data/popcon_dir/test_popcon_5
1 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_5 ARCH:i386 POPCONVER:1.52 |
2 | -1309407475 1303670994 perl-base /usr/bin/perl | |
3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | |
4 | -1309407450 1303670973 libc6 /lib/ld-2.11.2.so | |
2 | +1309407475 1303670994 konqueror /usr/bin/perl | |
5 | 3 | 1309407434 1295654294 dash /bin/dash |
6 | 4 | 0 0 libusbmuxd1 <NOFILES> |
7 | 5 | END-POPULARITY-CONTEST-0 TIME:1309407492 | ... | ... |
src/tests/test_data/popcon_dir/test_popcon_6
1 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_6 ARCH:i386 POPCONVER:1.52 |
2 | 2 | 1309407475 1303670994 perl-base /usr/bin/perl |
3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | |
4 | -1309407450 1303670973 libc6 /lib/ld-2.11.2.so | |
3 | +1309407451 1303670982 eog /lib/i686/cmov/libc-2.11.2.so | |
4 | +1309407450 1303670973 nautilus /lib/ld-2.11.2.so | |
5 | 5 | 1309407434 1295654294 dash /bin/dash |
6 | 6 | 0 0 libusbmuxd1 <NOFILES> |
7 | 7 | END-POPULARITY-CONTEST-0 TIME:1309407492 | ... | ... |
src/tests/test_data/popcon_dir/test_popcon_7
1 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_7 ARCH:i386 POPCONVER:1.52 |
2 | -1309407475 1303670994 perl-base /usr/bin/perl | |
3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | |
4 | -1309407450 1303670973 libc6 /lib/ld-2.11.2.so | |
2 | +1309407475 1303670994 apticron /usr/bin/perl | |
3 | +1309407451 1303670982 aptitude /lib/i686/cmov/libc-2.11.2.so | |
4 | +1309407450 1303670973 apt /lib/ld-2.11.2.so | |
5 | 5 | 1309407434 1295654294 dash /bin/dash |
6 | 6 | 0 0 libusbmuxd1 <NOFILES> |
7 | 7 | END-POPULARITY-CONTEST-0 TIME:1309407492 | ... | ... |
src/tests/test_data/popcon_dir/test_popcon_8
1 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_8 ARCH:i386 POPCONVER:1.52 |
2 | -1309407475 1303670994 perl-base /usr/bin/perl | |
3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | |
4 | -1309407450 1303670973 libc6 /lib/ld-2.11.2.so | |
2 | +1309407475 1303670994 apticron /usr/bin/perl | |
3 | +1309407451 1303670982 eog /lib/i686/cmov/libc-2.11.2.so | |
4 | +1309407450 1303670973 nautilus /lib/ld-2.11.2.so | |
5 | 5 | 1309407434 1295654294 dash /bin/dash |
6 | 6 | 0 0 libusbmuxd1 <NOFILES> |
7 | 7 | END-POPULARITY-CONTEST-0 TIME:1309407492 | ... | ... |
src/user.py
... | ... | @@ -152,6 +152,24 @@ class User: |
152 | 152 | desc_profile = self.desc_profile(items_repository,size)[:size/2] |
153 | 153 | return tag_profile+desc_profile |
154 | 154 | |
155 | + def app_pkg_profile(self,axi): | |
156 | + """ | |
157 | + Return list of packages that are applications. | |
158 | + """ | |
159 | + old_profile_size = len(self.pkg_profile) | |
160 | + for p in self.pkg_profile[:]: #iterate list copy | |
161 | + tags = data.axi_search_pkg_tags(axi,p) | |
162 | + try: | |
163 | + | |
164 | + if not "XTrole::program" in tags: | |
165 | + self.pkg_profile.remove(p) | |
166 | + except: | |
167 | + logging.debug("Package not found in axi: %s" % p) | |
168 | + profile_size = len(self.pkg_profile) | |
169 | + logging.debug("App package profile: reduced packages profile size \ | |
170 | + from %d to %d." % (old_profile_size, profile_size)) | |
171 | + return self.pkg_profile | |
172 | + | |
155 | 173 | def maximal_pkg_profile(self): |
156 | 174 | """ |
157 | 175 | Return list of packages that are not dependence of any other package in | ... | ... |