Commit 65be4b76c9e779b7f600b211e41649b0310b3eaa
Exists in
master
and in
1 other branch
Merge remote branch 'upstream/master'
Conflicts: src/data.py
Showing
15 changed files
with
198 additions
and
122 deletions
Show diff stats
src/config.py
| ... | ... | @@ -44,7 +44,8 @@ class Config(): |
| 44 | 44 | self.popcon_index = os.path.expanduser("~/.app-recommender/popcon_index") |
| 45 | 45 | self.popcon_dir = os.path.expanduser("~/.app-recommender/popcon_dir") |
| 46 | 46 | self.clusters_dir = os.path.expanduser("~/.app-recommender/clusters_dir") |
| 47 | - self.index_mode = "0" # use old index | |
| 47 | + self.k_medoids = 100 | |
| 48 | + self.index_mode = "old" | |
| 48 | 49 | self.strategy = "cb" |
| 49 | 50 | self.weight = "bm25" |
| 50 | 51 | self.load_options() |
| ... | ... | @@ -65,8 +66,9 @@ class Config(): |
| 65 | 66 | print " -a, --axi=PATH Path to Apt-xapian-index" |
| 66 | 67 | print " -p, --popconindex=PATH Path to popcon dedicated index" |
| 67 | 68 | print " -m, --popcondir=PATH Path to popcon submissions dir" |
| 68 | - print " -u, --index_mode= 0: old, 1:reindex, 11:clustered_index" | |
| 69 | + print " -u, --indexmode= old, reindex, cluster, recluster" | |
| 69 | 70 | print " -l, --clustersdir=PATH Path to popcon clusters dir" |
| 71 | + print " -e, --medoids=k Number of medoids for clustering" | |
| 70 | 72 | print " -w, --weight=OPTION Search weighting scheme" |
| 71 | 73 | print " -s, --strategy=OPTION Recommendation strategy" |
| 72 | 74 | print "" |
| ... | ... | @@ -115,13 +117,14 @@ class Config(): |
| 115 | 117 | self.popcon_dir = self.read_option('recommender', 'popcon_dir') |
| 116 | 118 | self.index_mode = self.read_option('recommender', 'index_mode') |
| 117 | 119 | self.clusters_dir = self.read_option('recommender', 'clusters_dir') |
| 120 | + self.k_medoids = self.read_option('recommender', 'k_medoids') | |
| 118 | 121 | self.weight = self.read_option('recommender', 'weight') |
| 119 | 122 | self.strategy = self.read_option('recommender', 'strategy') |
| 120 | 123 | |
| 121 | - short_options = "hdvo:c:a:p:m:ul:w:s:" | |
| 124 | + short_options = "hdvo:c:a:p:m:ul:e:w:s:" | |
| 122 | 125 | long_options = ["help", "debug", "verbose", "output=", "config=", |
| 123 | - "axi=", "popconindex=", "popcondir=", "index_mode=", | |
| 124 | - "clusters_dir=", "weight=", "strategy="] | |
| 126 | + "axi=", "popconindex=", "popcondir=", "indexmode=", | |
| 127 | + "clustersdir=", "kmedoids=", "weight=", "strategy="] | |
| 125 | 128 | try: |
| 126 | 129 | opts, args = getopt.getopt(sys.argv[1:], short_options, |
| 127 | 130 | long_options) |
| ... | ... | @@ -154,6 +157,8 @@ class Config(): |
| 154 | 157 | self.index_mode = p |
| 155 | 158 | elif o in ("-l", "--clustersdir"): |
| 156 | 159 | self.clusters_dir = p |
| 160 | + elif o in ("-e", "--kmedoids"): | |
| 161 | + self.k_medoids = p | |
| 157 | 162 | elif o in ("-w", "--weight"): |
| 158 | 163 | self.weight = p |
| 159 | 164 | elif o in ("-s", "--strategy"): | ... | ... |
src/data.py
| ... | ... | @@ -129,31 +129,36 @@ class PopconXapianIndex(xapian.WritableDatabase): |
| 129 | 129 | """ |
| 130 | 130 | self.axi = xapian.Database(cfg.axi) |
| 131 | 131 | self.path = os.path.expanduser(cfg.popcon_index) |
| 132 | - if cfg.index_mode.startswith("1") or not self.load_index(): | |
| 132 | + if not cfg.index_mode == "old" or not self.load_index(): | |
| 133 | 133 | if not os.path.exists(cfg.popcon_dir): |
| 134 | 134 | os.makedirs(cfg.popcon_dir) |
| 135 | 135 | if not os.listdir(cfg.popcon_dir): |
| 136 | 136 | logging.critical("Popcon dir seems to be empty.") |
| 137 | 137 | raise Error |
| 138 | - if cfg.index_mode == "10": | |
| 138 | + if cfg.index_mode == "reindex": | |
| 139 | 139 | self.source_dir = os.path.expanduser(cfg.popcon_dir) |
| 140 | 140 | else: |
| 141 | 141 | self.source_dir = os.path.expanduser(cfg.clusters_dir) |
| 142 | 142 | if not os.path.exists(cfg.clusters_dir): |
| 143 | 143 | os.makedirs(cfg.clusters_dir) |
| 144 | - if not os.listdir(cfg.clusters_dir): | |
| 145 | - distance = JaccardDistance() | |
| 144 | + if not os.listdir(cfg.clusters_dir) or \ | |
| 145 | + cfg.index_mode == "recluster": | |
| 146 | + shutil.rmtree(cfg.clusters_dir,1) | |
| 147 | + os.makedirs(cfg.clusters_dir) | |
| 146 | 148 | logging.info("Clustering popcon submissions from \'%s\'" |
| 147 | 149 | % cfg.popcon_dir) |
| 148 | 150 | logging.info("Clusters will be placed at \'%s\'" |
| 149 | 151 | % cfg.clusters_dir) |
| 152 | + distance = JaccardDistance() | |
| 150 | 153 | data = self.get_submissions(cfg.popcon_dir) |
| 151 | - if cfg.clustering == "Hierarchical": | |
| 152 | - self.hierarchical_clustering(data,cfg.clusters_dir, | |
| 153 | - distance) | |
| 154 | - else: | |
| 155 | - self.kmedoids_clustering(data,cfg.clusters_dir, | |
| 156 | - distance) | |
| 154 | + self.cluster_dispersion = \ | |
| 155 | + self.kmedoids_clustering(data, cfg.clusters_dir, | |
| 156 | + distance, cfg.k_medoids) | |
| 157 | + logging.info("Clusters dispersion: %f.2", | |
| 158 | + self.cluster_dispersion) | |
| 159 | + else: | |
| 160 | + logging.info("Using clusters from \'%s\'" % | |
| 161 | + cfg.clusters_dir) | |
| 157 | 162 | self.build_index() |
| 158 | 163 | |
| 159 | 164 | def __str__(self): |
| ... | ... | @@ -167,10 +172,9 @@ class PopconXapianIndex(xapian.WritableDatabase): |
| 167 | 172 | logging.info("Opening existing popcon xapian index at \'%s\'" |
| 168 | 173 | % self.path) |
| 169 | 174 | xapian.Database.__init__(self,self.path) |
| 170 | - return True | |
| 175 | + return 1 | |
| 171 | 176 | except xapian.DatabaseError: |
| 172 | 177 | logging.info("Could not open popcon index.") |
| 173 | - return True | |
| 174 | 178 | return 0 |
| 175 | 179 | |
| 176 | 180 | def build_index(self): |
| ... | ... | @@ -224,35 +228,23 @@ class PopconXapianIndex(xapian.WritableDatabase): |
| 224 | 228 | submissions.append(submission) |
| 225 | 229 | return submissions |
| 226 | 230 | |
| 227 | - def hierarchical_clustering(self,data,clusters_dir,distance,k=10): | |
| 228 | - """ | |
| 229 | - Select popcon submissions from popcon_dir and place them at clusters_dir | |
| 230 | - """ | |
| 231 | - cl = cluster.HierarchicalClustering(data, lambda x,y: | |
| 232 | - distance(x.packages.keys(), | |
| 233 | - y.packages.keys())) | |
| 234 | - clusters = cl.getlevel(0.5) | |
| 235 | - for c in clusters: | |
| 236 | - print "cluster" | |
| 237 | - for submission in c: | |
| 238 | - print submission.user_id | |
| 239 | - | |
| 240 | - def kmedoids_clustering(self,data,clusters_dir,distance,k=10): | |
| 231 | + def kmedoids_clustering(self,data,clusters_dir,distance,k_medoids): | |
| 241 | 232 | clusters = KMedoidsClustering(data,lambda x,y: |
| 242 | 233 | distance(x.packages.keys(), |
| 243 | 234 | y.packages.keys())) |
| 244 | - medoids = clusters.getMedoids(2) | |
| 235 | + medoids,dispersion = clusters.getMedoids(k_medoids) | |
| 245 | 236 | for submission in medoids: |
| 246 | 237 | shutil.copyfile(submission.path,os.path.join(clusters_dir, |
| 247 | 238 | submission.user_id)) |
| 239 | + return dispersion | |
| 248 | 240 | |
| 249 | 241 | class KMedoidsClustering(cluster.KMeansClustering): |
| 250 | 242 | |
| 251 | - def __init__(self,data,distance): | |
| 252 | - if len(data)<100: | |
| 243 | + def __init__(self,data,distance,max_data=100): | |
| 244 | + if len(data)<max_data: | |
| 253 | 245 | data_sample = data |
| 254 | 246 | else: |
| 255 | - data_sample = random.sample(data,100) | |
| 247 | + data_sample = random.sample(data,max_data) | |
| 256 | 248 | cluster.KMeansClustering.__init__(self, data_sample, distance) |
| 257 | 249 | self.distanceMatrix = {} |
| 258 | 250 | for submission in self._KMeansClustering__data: |
| ... | ... | @@ -287,7 +279,7 @@ class KMedoidsClustering(cluster.KMeansClustering): |
| 287 | 279 | logging.debug("medoidDistance: %f" % medoidDistance) |
| 288 | 280 | logging.debug("Cluster medoid: [%d] %s" % (medoid, |
| 289 | 281 | cluster[medoid].user_id)) |
| 290 | - return cluster[medoid] | |
| 282 | + return (cluster[medoid],medoidDistance) | |
| 291 | 283 | |
| 292 | 284 | def assign_item(self, item, origin): |
| 293 | 285 | """ |
| ... | ... | @@ -295,7 +287,8 @@ class KMedoidsClustering(cluster.KMeansClustering): |
| 295 | 287 | """ |
| 296 | 288 | closest_cluster = origin |
| 297 | 289 | for cluster in self._KMeansClustering__clusters: |
| 298 | - if self.distance(item,self.getMedoid(cluster)) < self.distance(item,self.getMedoid(closest_cluster)): | |
| 290 | + if self.distance(item,self.getMedoid(cluster)[0]) < \ | |
| 291 | + self.distance(item,self.getMedoid(closest_cluster)[0]): | |
| 299 | 292 | closest_cluster = cluster |
| 300 | 293 | |
| 301 | 294 | if closest_cluster != origin: |
| ... | ... | @@ -309,6 +302,8 @@ class KMedoidsClustering(cluster.KMeansClustering): |
| 309 | 302 | """ |
| 310 | 303 | Generate n clusters and return their medoids. |
| 311 | 304 | """ |
| 312 | - medoids = [self.getMedoid(cluster) for cluster in self.getclusters(n)] | |
| 313 | - logging.info("Clustering completed and the following centroids were found: %s" % [c.user_id for c in medoids]) | |
| 314 | - return medoids | |
| 305 | + medoids_distances = [self.getMedoid(cluster) for cluster in self.getclusters(n)] | |
| 306 | + medoids = [m[0] for m in medoids_distances] | |
| 307 | + dispersion = sum([m[1] for m in medoids_distances]) | |
| 308 | + logging.info("Clustering completed and the following medoids were found: %s" % [c.user_id for c in medoids]) | |
| 309 | + return medoids,dispersion | ... | ... |
src/evaluation.py
| ... | ... | @@ -49,6 +49,45 @@ class Metric(Singleton): |
| 49 | 49 | evaluation.real_item_scores[k])) |
| 50 | 50 | return errors |
| 51 | 51 | |
| 52 | + | |
| 53 | +class SimpleAccuracy(Metric): | |
| 54 | + """ | |
| 55 | + Classification accuracy metric which consider classes sizes. | |
| 56 | + """ | |
| 57 | + def __init__(self): | |
| 58 | + """ | |
| 59 | + Set metric description. | |
| 60 | + """ | |
| 61 | + self.desc = " S_Accuracy " | |
| 62 | + | |
| 63 | + def run(self,evaluation): | |
| 64 | + """ | |
| 65 | + Compute metric. | |
| 66 | + """ | |
| 67 | + return float((evaluation.repository_size- | |
| 68 | + len(evaluation.false_positive))- | |
| 69 | + len(evaluation.false_negative))/evaluation.repository_size | |
| 70 | + | |
| 71 | +class Accuracy(Metric): | |
| 72 | + """ | |
| 73 | + Classification accuracy metric which consider classes sizes. | |
| 74 | + """ | |
| 75 | + def __init__(self): | |
| 76 | + """ | |
| 77 | + Set metric description. | |
| 78 | + """ | |
| 79 | + self.desc = " Accuracy " | |
| 80 | + | |
| 81 | + def run(self,evaluation): | |
| 82 | + """ | |
| 83 | + Compute metric. | |
| 84 | + """ | |
| 85 | + error_1 = (float(len(evaluation.false_positive))/ | |
| 86 | + (evaluation.repository_size-len(evaluation.real_relevant))) | |
| 87 | + error_2 = (float(len(evaluation.false_negative))/len(evaluation.real_relevant)) | |
| 88 | + accuracy = 1-(float(error_1+error_2)/2) | |
| 89 | + return accuracy | |
| 90 | + | |
| 52 | 91 | class Precision(Metric): |
| 53 | 92 | """ |
| 54 | 93 | Classification accuracy metric defined as the percentage of relevant itens |
| ... | ... | @@ -64,7 +103,7 @@ class Precision(Metric): |
| 64 | 103 | """ |
| 65 | 104 | Compute metric. |
| 66 | 105 | """ |
| 67 | - return float(len(evaluation.predicted_real))/len(evaluation.predicted_relevant) | |
| 106 | + return float(len(evaluation.true_positive))/len(evaluation.predicted_relevant) | |
| 68 | 107 | |
| 69 | 108 | class Recall(Metric): |
| 70 | 109 | """ |
| ... | ... | @@ -81,7 +120,7 @@ class Recall(Metric): |
| 81 | 120 | """ |
| 82 | 121 | Compute metric. |
| 83 | 122 | """ |
| 84 | - return float(len(evaluation.predicted_real))/len(evaluation.real_relevant) | |
| 123 | + return float(len(evaluation.true_positive))/len(evaluation.real_relevant) | |
| 85 | 124 | |
| 86 | 125 | class F1(Metric): |
| 87 | 126 | """ |
| ... | ... | @@ -100,7 +139,10 @@ class F1(Metric): |
| 100 | 139 | """ |
| 101 | 140 | p = Precision().run(evaluation) |
| 102 | 141 | r = Recall().run(evaluation) |
| 103 | - return float((2*p*r))/(p+r) | |
| 142 | + if (p+r)>0: | |
| 143 | + return float((2*p*r))/(p+r) | |
| 144 | + else: | |
| 145 | + return 0 | |
| 104 | 146 | |
| 105 | 147 | class MAE(Metric): |
| 106 | 148 | """ |
| ... | ... | @@ -158,43 +200,47 @@ class Coverage(Metric): |
| 158 | 200 | Evaluation metric defined as the percentage of itens covered by the |
| 159 | 201 | recommender (have been recommended at least once). |
| 160 | 202 | """ |
| 161 | - def __init__(self,repository_size): | |
| 203 | + def __init__(self): | |
| 162 | 204 | """ |
| 163 | 205 | Set initial parameters. |
| 164 | 206 | """ |
| 165 | 207 | self.desc = " Coverage " |
| 166 | - self.repository_size = repository_size | |
| 167 | - self.covered = set() | |
| 168 | - | |
| 169 | - def save_covered(self,recommended_list): | |
| 170 | - """ | |
| 171 | - Register that a list of itens has been recommended. | |
| 172 | - """ | |
| 173 | - self.covered.update(set(recommended_list)) | |
| 174 | 208 | |
| 175 | - def run(self,evaluation): | |
| 209 | + def run(self,evaluations_set): | |
| 176 | 210 | """ |
| 177 | 211 | Compute metric. |
| 178 | 212 | """ |
| 179 | - return float(self.covered.size)/self.repository_size | |
| 213 | + covered = set() | |
| 214 | + for evaluation in evaluations_set: | |
| 215 | + covered.update(set(evaluation.predicted_relevant)) | |
| 216 | + return float(len(covered))/evaluation.repository_size | |
| 180 | 217 | |
| 181 | 218 | class Evaluation: |
| 182 | 219 | """ |
| 183 | 220 | Class designed to perform prediction evaluation, given data and metric. |
| 184 | 221 | """ |
| 185 | - def __init__(self,predicted_result,real_result): | |
| 222 | + def __init__(self,predicted,real,repository_size): | |
| 186 | 223 | """ |
| 187 | 224 | Set initial parameters. |
| 188 | 225 | """ |
| 189 | - self.predicted_item_scores = predicted_result.item_score | |
| 190 | - self.predicted_relevant = predicted_result.get_prediction() | |
| 191 | - self.real_item_scores = real_result.item_score | |
| 192 | - self.real_relevant = real_result.get_prediction() | |
| 193 | - self.predicted_real = [v for v in self.predicted_relevant if v in | |
| 194 | - self.real_relevant] | |
| 195 | - #print len(self.predicted_relevant) | |
| 196 | - #print len(self.real_relevant) | |
| 197 | - #print len(self.predicted_real) | |
| 226 | + self.repository_size = repository_size | |
| 227 | + self.predicted_item_scores = predicted.item_score | |
| 228 | + self.predicted_relevant = predicted.get_prediction() | |
| 229 | + self.real_item_scores = real.item_score | |
| 230 | + self.real_relevant = real.get_prediction() | |
| 231 | + | |
| 232 | + self.true_positive = [v[0] for v in self.predicted_relevant if v[0] in | |
| 233 | + [w[0] for w in self.real_relevant]] | |
| 234 | + self.false_positive = [v[0] for v in self.predicted_relevant if not v[0] in | |
| 235 | + [w[0] for w in self.real_relevant]] | |
| 236 | + self.false_negative = [v[0] for v in self.real_relevant if not v[0] in | |
| 237 | + [w[0] for w in self.predicted_relevant]] | |
| 238 | + | |
| 239 | + logging.debug("TP: %d" % len(self.true_positive)) | |
| 240 | + logging.debug("FP: %d" % len(self.false_positive)) | |
| 241 | + logging.debug("FN: %d" % len(self.false_negative)) | |
| 242 | + logging.debug("Repo_size: %d" % self.repository_size) | |
| 243 | + logging.debug("Relevant: %d" % len(self.real_relevant)) | |
| 198 | 244 | |
| 199 | 245 | def run(self,metric): |
| 200 | 246 | """ |
| ... | ... | @@ -206,7 +252,7 @@ class CrossValidation: |
| 206 | 252 | """ |
| 207 | 253 | Class designed to perform cross-validation process. |
| 208 | 254 | """ |
| 209 | - def __init__(self,partition_proportion,rounds,rec,metrics_list): | |
| 255 | + def __init__(self,partition_proportion,rounds,rec,metrics_list,result_proportion): | |
| 210 | 256 | """ |
| 211 | 257 | Set initial parameters. |
| 212 | 258 | """ |
| ... | ... | @@ -219,34 +265,13 @@ class CrossValidation: |
| 219 | 265 | self.recommender = rec |
| 220 | 266 | self.metrics_list = metrics_list |
| 221 | 267 | self.cross_results = defaultdict(list) |
| 222 | - | |
| 223 | - def __str__(self): | |
| 224 | - """ | |
| 225 | - String representation of the object. | |
| 226 | - """ | |
| 227 | - str = "\n" | |
| 228 | - metrics_desc = "" | |
| 229 | - for metric in self.metrics_list: | |
| 230 | - metrics_desc += "%s|" % (metric.desc) | |
| 231 | - str += "| Round |%s\n" % metrics_desc | |
| 232 | - for r in range(self.rounds): | |
| 233 | - metrics_result = "" | |
| 234 | - for metric in self.metrics_list: | |
| 235 | - metrics_result += (" %2.1f%% |" % | |
| 236 | - (self.cross_results[metric.desc][r]*100)) | |
| 237 | - str += "| %d |%s\n" % (r,metrics_result) | |
| 238 | - metrics_mean = "" | |
| 239 | - for metric in self.metrics_list: | |
| 240 | - mean = float(sum(self.cross_results[metric.desc]) / | |
| 241 | - len(self.cross_results[metric.desc])) | |
| 242 | - metrics_mean += " %2.1f%% |" % (mean*100) | |
| 243 | - str += "| Mean |%s\n" % (metrics_mean) | |
| 244 | - return str | |
| 268 | + self.result_proportion = result_proportion | |
| 245 | 269 | |
| 246 | 270 | def run(self,user): |
| 247 | 271 | """ |
| 248 | 272 | Perform cross-validation. |
| 249 | 273 | """ |
| 274 | + # | |
| 250 | 275 | cross_item_score = dict.fromkeys(user.pkg_profile,1) |
| 251 | 276 | partition_size = int(len(cross_item_score)*self.partition_proportion) |
| 252 | 277 | for r in range(self.rounds): |
| ... | ... | @@ -258,10 +283,17 @@ class CrossValidation: |
| 258 | 283 | logging.critical("Empty cross_item_score.") |
| 259 | 284 | raise Error |
| 260 | 285 | round_partition[random_key] = cross_item_score.pop(random_key) |
| 286 | + #logging.debug("Round partition: %s",str(round_partition)) | |
| 287 | + #logging.debug("Cross item-score: %s",str(cross_item_score)) | |
| 261 | 288 | round_user = User(cross_item_score) |
| 262 | - predicted_result = self.recommender.get_recommendation(round_user) | |
| 263 | - real_result = RecommendationResult(round_partition,len(round_partition)) | |
| 264 | - evaluation = Evaluation(predicted_result,real_result) | |
| 289 | + result_size = int(self.recommender.items_repository.get_doccount()* | |
| 290 | + self.result_proportion) | |
| 291 | + predicted_result = self.recommender.get_recommendation(round_user,result_size) | |
| 292 | + print len(round_partition) | |
| 293 | + real_result = RecommendationResult(round_partition) | |
| 294 | + #logging.debug("Predicted result: %s",predicted_result) | |
| 295 | + evaluation = Evaluation(predicted_result,real_result, | |
| 296 | + self.recommender.items_repository.get_doccount()) | |
| 265 | 297 | for metric in self.metrics_list: |
| 266 | 298 | result = evaluation.run(metric) |
| 267 | 299 | self.cross_results[metric.desc].append(result) |
| ... | ... | @@ -269,3 +301,26 @@ class CrossValidation: |
| 269 | 301 | item,score = round_partition.popitem() |
| 270 | 302 | cross_item_score[item] = score |
| 271 | 303 | |
| 304 | + def __str__(self): | |
| 305 | + """ | |
| 306 | + String representation of the object. | |
| 307 | + """ | |
| 308 | + str = "\n" | |
| 309 | + metrics_desc = "" | |
| 310 | + for metric in self.metrics_list: | |
| 311 | + metrics_desc += "%s|" % (metric.desc) | |
| 312 | + str += "| Round |%s\n" % metrics_desc | |
| 313 | + for r in range(self.rounds): | |
| 314 | + metrics_result = "" | |
| 315 | + for metric in self.metrics_list: | |
| 316 | + metrics_result += (" %2.1f%% |" % | |
| 317 | + (self.cross_results[metric.desc][r]*100)) | |
| 318 | + str += "| %d |%s\n" % (r,metrics_result) | |
| 319 | + metrics_mean = "" | |
| 320 | + for metric in self.metrics_list: | |
| 321 | + mean = float(sum(self.cross_results[metric.desc]) / | |
| 322 | + len(self.cross_results[metric.desc])) | |
| 323 | + metrics_mean += " %2.1f%% |" % (mean*100) | |
| 324 | + str += "| Mean |%s\n" % (metrics_mean) | |
| 325 | + return str | |
| 326 | + | ... | ... |
src/recommender.py
| ... | ... | @@ -45,13 +45,15 @@ class RecommendationResult: |
| 45 | 45 | str += "%2d: %s\n" % (i,result[i][0]) |
| 46 | 46 | return str |
| 47 | 47 | |
| 48 | - def get_prediction(self,limit=20): | |
| 48 | + def get_prediction(self,limit=0): | |
| 49 | 49 | """ |
| 50 | 50 | Return prediction based on recommendation size (number of items). |
| 51 | 51 | """ |
| 52 | - if limit > self.size: limit = self.size | |
| 53 | 52 | sorted_result = sorted(self.item_score.items(), |
| 54 | 53 | key=operator.itemgetter(1)) |
| 54 | + if not limit or limit > self.size: | |
| 55 | + limit = self.size | |
| 56 | + | |
| 55 | 57 | return list(reversed(sorted_result[-limit:])) |
| 56 | 58 | |
| 57 | 59 | class Recommender: |
| ... | ... | @@ -63,13 +65,12 @@ class Recommender: |
| 63 | 65 | Set initial parameters. |
| 64 | 66 | """ |
| 65 | 67 | self.items_repository = xapian.Database(cfg.axi) |
| 66 | - self.users_repository = data.PopconXapianIndex(cfg) | |
| 67 | - #self.clustered_users_repository = data.PopconXapianIndex(cfg) | |
| 68 | 68 | self.set_strategy(cfg.strategy) |
| 69 | 69 | if cfg.weight == "bm25": |
| 70 | 70 | self.weight = xapian.BM25Weight() |
| 71 | 71 | else: |
| 72 | 72 | self.weight = xapian.TradWeight() |
| 73 | + self.cfg = cfg | |
| 73 | 74 | |
| 74 | 75 | def set_strategy(self,strategy_str): |
| 75 | 76 | """ |
| ... | ... | @@ -83,6 +84,7 @@ class Recommender: |
| 83 | 84 | self.strategy = strategy.ContentBasedStrategy("desc") |
| 84 | 85 | if strategy_str == "col": |
| 85 | 86 | self.strategy = strategy.CollaborativeStrategy(20) |
| 87 | + self.users_repository = data.PopconXapianIndex(self.cfg) | |
| 86 | 88 | |
| 87 | 89 | def get_recommendation(self,user,result_size=20): |
| 88 | 90 | """ | ... | ... |
src/tests/data_tests.py
| ... | ... | @@ -71,13 +71,13 @@ class PopconXapianIndexTests(unittest2.TestCase): |
| 71 | 71 | |
| 72 | 72 | def test_reindex(self): |
| 73 | 73 | # force reindex with no clustering |
| 74 | - self.cfg.index_mode = "10" | |
| 74 | + self.cfg.index_mode = "reindex" | |
| 75 | 75 | pxi = PopconXapianIndex(self.cfg) |
| 76 | 76 | self.assertEqual(pxi.get_metadata("old"),"") |
| 77 | 77 | |
| 78 | 78 | def test_clustering(self): |
| 79 | 79 | # force reindex with clustering |
| 80 | - self.cfg.index_mode = "11" | |
| 80 | + self.cfg.index_mode = "cluster" | |
| 81 | 81 | pxi = PopconXapianIndex(self.cfg) |
| 82 | 82 | self.assertEqual(pxi.source_dir,self.cfg.clusters_dir) |
| 83 | 83 | all_submissions = [submissions for (root, dirs, submissions) in |
| ... | ... | @@ -95,6 +95,13 @@ class PopconXapianIndexTests(unittest2.TestCase): |
| 95 | 95 | sum([len(submissions) for submissions in |
| 96 | 96 | all_submissions])) |
| 97 | 97 | |
| 98 | + def test_recluster(self): | |
| 99 | + # force reindexing and clustering | |
| 100 | + self.cfg.index_mode = "recluster" | |
| 101 | + self.cfg.k_medoids = 2 | |
| 102 | + pxi = PopconXapianIndex(self.cfg) | |
| 103 | + self.assertEqual(pxi.source_dir,self.cfg.clusters_dir) | |
| 104 | + self.assertEqual(pxi.get_doccount(),2) | |
| 98 | 105 | |
| 99 | 106 | if __name__ == '__main__': |
| 100 | 107 | unittest2.main() | ... | ... |
src/tests/test_data/popcon_dir/test_popcon_0
| 1 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_0 ARCH:i386 POPCONVER:1.52 |
| 2 | -1309407475 1303670994 perl-base /usr/bin/perl | |
| 3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | |
| 4 | -1309407450 1303670973 libc6 /lib/ld-2.11.2.so | |
| 2 | +1309407475 1303670994 gimp /usr/bin/perl | |
| 3 | +1309407451 1303670982 inkscape /lib/i686/cmov/libc-2.11.2.so | |
| 4 | +1309407450 1303670973 imagination /lib/ld-2.11.2.so | |
| 5 | 5 | 1309407434 1295654294 dash /bin/dash |
| 6 | 6 | 0 0 libusbmuxd1 <NOFILES> |
| 7 | 7 | END-POPULARITY-CONTEST-0 TIME:1309407492 | ... | ... |
src/tests/test_data/popcon_dir/test_popcon_1
| 1 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_1 ARCH:i386 POPCONVER:1.52 |
| 2 | -1309407475 1303670994 perl-base /usr/bin/perl | |
| 3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | |
| 4 | -1309407450 1303670973 libc6 /lib/ld-2.11.2.so | |
| 2 | +1309407475 1303670994 gimp /usr/bin/perl | |
| 5 | 3 | 1309407434 1295654294 dash /bin/dash |
| 6 | 4 | 0 0 libusbmuxd1 <NOFILES> |
| 7 | 5 | END-POPULARITY-CONTEST-0 TIME:1309407492 | ... | ... |
src/tests/test_data/popcon_dir/test_popcon_2
| 1 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_2 ARCH:i386 POPCONVER:1.52 |
| 2 | -1309407475 1303670994 perl-base /usr/bin/perl | |
| 3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | |
| 2 | +1309407475 1303670994 iceweasel /usr/bin/perl | |
| 3 | +1309407451 1303670982 python /lib/i686/cmov/libc-2.11.2.so | |
| 4 | 4 | 1309407450 1303670973 libc6 /lib/ld-2.11.2.so |
| 5 | 5 | 1309407434 1295654294 dash /bin/dash |
| 6 | 6 | 0 0 libusbmuxd1 <NOFILES> | ... | ... |
src/tests/test_data/popcon_dir/test_popcon_3
| 1 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_3 ARCH:i386 POPCONVER:1.52 |
| 2 | -1309407475 1303670994 perl-base /usr/bin/perl | |
| 3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | |
| 2 | +1309407475 1303670994 eog /usr/bin/perl | |
| 3 | +1309407451 1303670982 nautilus /lib/i686/cmov/libc-2.11.2.so | |
| 4 | 4 | 1309407450 1303670973 libc6 /lib/ld-2.11.2.so |
| 5 | -1309407434 1295654294 dash /bin/dash | |
| 6 | 5 | 0 0 libusbmuxd1 <NOFILES> |
| 7 | 6 | END-POPULARITY-CONTEST-0 TIME:1309407492 | ... | ... |
src/tests/test_data/popcon_dir/test_popcon_4
| 1 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_4 ARCH:i386 POPCONVER:1.52 |
| 2 | -1309407475 1303670994 perl-base /usr/bin/perl | |
| 3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | |
| 2 | +1309407475 1303670994 konqueror /usr/bin/perl | |
| 3 | +1309407451 1303670982 kedit /lib/i686/cmov/libc-2.11.2.so | |
| 4 | 4 | 1309407450 1303670973 libc6 /lib/ld-2.11.2.so |
| 5 | -1309407434 1295654294 dash /bin/dash | |
| 6 | 5 | 0 0 libusbmuxd1 <NOFILES> |
| 7 | 6 | END-POPULARITY-CONTEST-0 TIME:1309407492 | ... | ... |
src/tests/test_data/popcon_dir/test_popcon_5
| 1 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_5 ARCH:i386 POPCONVER:1.52 |
| 2 | -1309407475 1303670994 perl-base /usr/bin/perl | |
| 3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | |
| 4 | -1309407450 1303670973 libc6 /lib/ld-2.11.2.so | |
| 2 | +1309407475 1303670994 konqueror /usr/bin/perl | |
| 5 | 3 | 1309407434 1295654294 dash /bin/dash |
| 6 | 4 | 0 0 libusbmuxd1 <NOFILES> |
| 7 | 5 | END-POPULARITY-CONTEST-0 TIME:1309407492 | ... | ... |
src/tests/test_data/popcon_dir/test_popcon_6
| 1 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_6 ARCH:i386 POPCONVER:1.52 |
| 2 | 2 | 1309407475 1303670994 perl-base /usr/bin/perl |
| 3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | |
| 4 | -1309407450 1303670973 libc6 /lib/ld-2.11.2.so | |
| 3 | +1309407451 1303670982 eog /lib/i686/cmov/libc-2.11.2.so | |
| 4 | +1309407450 1303670973 nautilus /lib/ld-2.11.2.so | |
| 5 | 5 | 1309407434 1295654294 dash /bin/dash |
| 6 | 6 | 0 0 libusbmuxd1 <NOFILES> |
| 7 | 7 | END-POPULARITY-CONTEST-0 TIME:1309407492 | ... | ... |
src/tests/test_data/popcon_dir/test_popcon_7
| 1 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_7 ARCH:i386 POPCONVER:1.52 |
| 2 | -1309407475 1303670994 perl-base /usr/bin/perl | |
| 3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | |
| 4 | -1309407450 1303670973 libc6 /lib/ld-2.11.2.so | |
| 2 | +1309407475 1303670994 apticron /usr/bin/perl | |
| 3 | +1309407451 1303670982 aptitude /lib/i686/cmov/libc-2.11.2.so | |
| 4 | +1309407450 1303670973 apt /lib/ld-2.11.2.so | |
| 5 | 5 | 1309407434 1295654294 dash /bin/dash |
| 6 | 6 | 0 0 libusbmuxd1 <NOFILES> |
| 7 | 7 | END-POPULARITY-CONTEST-0 TIME:1309407492 | ... | ... |
src/tests/test_data/popcon_dir/test_popcon_8
| 1 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_8 ARCH:i386 POPCONVER:1.52 |
| 2 | -1309407475 1303670994 perl-base /usr/bin/perl | |
| 3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | |
| 4 | -1309407450 1303670973 libc6 /lib/ld-2.11.2.so | |
| 2 | +1309407475 1303670994 apticron /usr/bin/perl | |
| 3 | +1309407451 1303670982 eog /lib/i686/cmov/libc-2.11.2.so | |
| 4 | +1309407450 1303670973 nautilus /lib/ld-2.11.2.so | |
| 5 | 5 | 1309407434 1295654294 dash /bin/dash |
| 6 | 6 | 0 0 libusbmuxd1 <NOFILES> |
| 7 | 7 | END-POPULARITY-CONTEST-0 TIME:1309407492 | ... | ... |
src/user.py
| ... | ... | @@ -152,6 +152,24 @@ class User: |
| 152 | 152 | desc_profile = self.desc_profile(items_repository,size)[:size/2] |
| 153 | 153 | return tag_profile+desc_profile |
| 154 | 154 | |
| 155 | + def app_pkg_profile(self,axi): | |
| 156 | + """ | |
| 157 | + Return list of packages that are applications. | |
| 158 | + """ | |
| 159 | + old_profile_size = len(self.pkg_profile) | |
| 160 | + for p in self.pkg_profile[:]: #iterate list copy | |
| 161 | + tags = data.axi_search_pkg_tags(axi,p) | |
| 162 | + try: | |
| 163 | + | |
| 164 | + if not "XTrole::program" in tags: | |
| 165 | + self.pkg_profile.remove(p) | |
| 166 | + except: | |
| 167 | + logging.debug("Package not found in axi: %s" % p) | |
| 168 | + profile_size = len(self.pkg_profile) | |
| 169 | + logging.debug("App package profile: reduced packages profile size \ | |
| 170 | + from %d to %d." % (old_profile_size, profile_size)) | |
| 171 | + return self.pkg_profile | |
| 172 | + | |
| 155 | 173 | def maximal_pkg_profile(self): |
| 156 | 174 | """ |
| 157 | 175 | Return list of packages that are not dependence of any other package in | ... | ... |