Commit 65be4b76c9e779b7f600b211e41649b0310b3eaa
Exists in
master
and in
1 other branch
Merge remote branch 'upstream/master'
Conflicts: src/data.py
Showing
15 changed files
with
198 additions
and
122 deletions
Show diff stats
src/config.py
| @@ -44,7 +44,8 @@ class Config(): | @@ -44,7 +44,8 @@ class Config(): | ||
| 44 | self.popcon_index = os.path.expanduser("~/.app-recommender/popcon_index") | 44 | self.popcon_index = os.path.expanduser("~/.app-recommender/popcon_index") |
| 45 | self.popcon_dir = os.path.expanduser("~/.app-recommender/popcon_dir") | 45 | self.popcon_dir = os.path.expanduser("~/.app-recommender/popcon_dir") |
| 46 | self.clusters_dir = os.path.expanduser("~/.app-recommender/clusters_dir") | 46 | self.clusters_dir = os.path.expanduser("~/.app-recommender/clusters_dir") |
| 47 | - self.index_mode = "0" # use old index | 47 | + self.k_medoids = 100 |
| 48 | + self.index_mode = "old" | ||
| 48 | self.strategy = "cb" | 49 | self.strategy = "cb" |
| 49 | self.weight = "bm25" | 50 | self.weight = "bm25" |
| 50 | self.load_options() | 51 | self.load_options() |
| @@ -65,8 +66,9 @@ class Config(): | @@ -65,8 +66,9 @@ class Config(): | ||
| 65 | print " -a, --axi=PATH Path to Apt-xapian-index" | 66 | print " -a, --axi=PATH Path to Apt-xapian-index" |
| 66 | print " -p, --popconindex=PATH Path to popcon dedicated index" | 67 | print " -p, --popconindex=PATH Path to popcon dedicated index" |
| 67 | print " -m, --popcondir=PATH Path to popcon submissions dir" | 68 | print " -m, --popcondir=PATH Path to popcon submissions dir" |
| 68 | - print " -u, --index_mode= 0: old, 1:reindex, 11:clustered_index" | 69 | + print " -u, --indexmode= old, reindex, cluster, recluster" |
| 69 | print " -l, --clustersdir=PATH Path to popcon clusters dir" | 70 | print " -l, --clustersdir=PATH Path to popcon clusters dir" |
| 71 | + print " -e, --medoids=k Number of medoids for clustering" | ||
| 70 | print " -w, --weight=OPTION Search weighting scheme" | 72 | print " -w, --weight=OPTION Search weighting scheme" |
| 71 | print " -s, --strategy=OPTION Recommendation strategy" | 73 | print " -s, --strategy=OPTION Recommendation strategy" |
| 72 | print "" | 74 | print "" |
| @@ -115,13 +117,14 @@ class Config(): | @@ -115,13 +117,14 @@ class Config(): | ||
| 115 | self.popcon_dir = self.read_option('recommender', 'popcon_dir') | 117 | self.popcon_dir = self.read_option('recommender', 'popcon_dir') |
| 116 | self.index_mode = self.read_option('recommender', 'index_mode') | 118 | self.index_mode = self.read_option('recommender', 'index_mode') |
| 117 | self.clusters_dir = self.read_option('recommender', 'clusters_dir') | 119 | self.clusters_dir = self.read_option('recommender', 'clusters_dir') |
| 120 | + self.k_medoids = self.read_option('recommender', 'k_medoids') | ||
| 118 | self.weight = self.read_option('recommender', 'weight') | 121 | self.weight = self.read_option('recommender', 'weight') |
| 119 | self.strategy = self.read_option('recommender', 'strategy') | 122 | self.strategy = self.read_option('recommender', 'strategy') |
| 120 | 123 | ||
| 121 | - short_options = "hdvo:c:a:p:m:ul:w:s:" | 124 | + short_options = "hdvo:c:a:p:m:ul:e:w:s:" |
| 122 | long_options = ["help", "debug", "verbose", "output=", "config=", | 125 | long_options = ["help", "debug", "verbose", "output=", "config=", |
| 123 | - "axi=", "popconindex=", "popcondir=", "index_mode=", | ||
| 124 | - "clusters_dir=", "weight=", "strategy="] | 126 | + "axi=", "popconindex=", "popcondir=", "indexmode=", |
| 127 | + "clustersdir=", "kmedoids=", "weight=", "strategy="] | ||
| 125 | try: | 128 | try: |
| 126 | opts, args = getopt.getopt(sys.argv[1:], short_options, | 129 | opts, args = getopt.getopt(sys.argv[1:], short_options, |
| 127 | long_options) | 130 | long_options) |
| @@ -154,6 +157,8 @@ class Config(): | @@ -154,6 +157,8 @@ class Config(): | ||
| 154 | self.index_mode = p | 157 | self.index_mode = p |
| 155 | elif o in ("-l", "--clustersdir"): | 158 | elif o in ("-l", "--clustersdir"): |
| 156 | self.clusters_dir = p | 159 | self.clusters_dir = p |
| 160 | + elif o in ("-e", "--kmedoids"): | ||
| 161 | + self.k_medoids = p | ||
| 157 | elif o in ("-w", "--weight"): | 162 | elif o in ("-w", "--weight"): |
| 158 | self.weight = p | 163 | self.weight = p |
| 159 | elif o in ("-s", "--strategy"): | 164 | elif o in ("-s", "--strategy"): |
src/data.py
| @@ -129,31 +129,36 @@ class PopconXapianIndex(xapian.WritableDatabase): | @@ -129,31 +129,36 @@ class PopconXapianIndex(xapian.WritableDatabase): | ||
| 129 | """ | 129 | """ |
| 130 | self.axi = xapian.Database(cfg.axi) | 130 | self.axi = xapian.Database(cfg.axi) |
| 131 | self.path = os.path.expanduser(cfg.popcon_index) | 131 | self.path = os.path.expanduser(cfg.popcon_index) |
| 132 | - if cfg.index_mode.startswith("1") or not self.load_index(): | 132 | + if not cfg.index_mode == "old" or not self.load_index(): |
| 133 | if not os.path.exists(cfg.popcon_dir): | 133 | if not os.path.exists(cfg.popcon_dir): |
| 134 | os.makedirs(cfg.popcon_dir) | 134 | os.makedirs(cfg.popcon_dir) |
| 135 | if not os.listdir(cfg.popcon_dir): | 135 | if not os.listdir(cfg.popcon_dir): |
| 136 | logging.critical("Popcon dir seems to be empty.") | 136 | logging.critical("Popcon dir seems to be empty.") |
| 137 | raise Error | 137 | raise Error |
| 138 | - if cfg.index_mode == "10": | 138 | + if cfg.index_mode == "reindex": |
| 139 | self.source_dir = os.path.expanduser(cfg.popcon_dir) | 139 | self.source_dir = os.path.expanduser(cfg.popcon_dir) |
| 140 | else: | 140 | else: |
| 141 | self.source_dir = os.path.expanduser(cfg.clusters_dir) | 141 | self.source_dir = os.path.expanduser(cfg.clusters_dir) |
| 142 | if not os.path.exists(cfg.clusters_dir): | 142 | if not os.path.exists(cfg.clusters_dir): |
| 143 | os.makedirs(cfg.clusters_dir) | 143 | os.makedirs(cfg.clusters_dir) |
| 144 | - if not os.listdir(cfg.clusters_dir): | ||
| 145 | - distance = JaccardDistance() | 144 | + if not os.listdir(cfg.clusters_dir) or \ |
| 145 | + cfg.index_mode == "recluster": | ||
| 146 | + shutil.rmtree(cfg.clusters_dir,1) | ||
| 147 | + os.makedirs(cfg.clusters_dir) | ||
| 146 | logging.info("Clustering popcon submissions from \'%s\'" | 148 | logging.info("Clustering popcon submissions from \'%s\'" |
| 147 | % cfg.popcon_dir) | 149 | % cfg.popcon_dir) |
| 148 | logging.info("Clusters will be placed at \'%s\'" | 150 | logging.info("Clusters will be placed at \'%s\'" |
| 149 | % cfg.clusters_dir) | 151 | % cfg.clusters_dir) |
| 152 | + distance = JaccardDistance() | ||
| 150 | data = self.get_submissions(cfg.popcon_dir) | 153 | data = self.get_submissions(cfg.popcon_dir) |
| 151 | - if cfg.clustering == "Hierarchical": | ||
| 152 | - self.hierarchical_clustering(data,cfg.clusters_dir, | ||
| 153 | - distance) | ||
| 154 | - else: | ||
| 155 | - self.kmedoids_clustering(data,cfg.clusters_dir, | ||
| 156 | - distance) | 154 | + self.cluster_dispersion = \ |
| 155 | + self.kmedoids_clustering(data, cfg.clusters_dir, | ||
| 156 | + distance, cfg.k_medoids) | ||
| 157 | + logging.info("Clusters dispersion: %f.2", | ||
| 158 | + self.cluster_dispersion) | ||
| 159 | + else: | ||
| 160 | + logging.info("Using clusters from \'%s\'" % | ||
| 161 | + cfg.clusters_dir) | ||
| 157 | self.build_index() | 162 | self.build_index() |
| 158 | 163 | ||
| 159 | def __str__(self): | 164 | def __str__(self): |
| @@ -167,10 +172,9 @@ class PopconXapianIndex(xapian.WritableDatabase): | @@ -167,10 +172,9 @@ class PopconXapianIndex(xapian.WritableDatabase): | ||
| 167 | logging.info("Opening existing popcon xapian index at \'%s\'" | 172 | logging.info("Opening existing popcon xapian index at \'%s\'" |
| 168 | % self.path) | 173 | % self.path) |
| 169 | xapian.Database.__init__(self,self.path) | 174 | xapian.Database.__init__(self,self.path) |
| 170 | - return True | 175 | + return 1 |
| 171 | except xapian.DatabaseError: | 176 | except xapian.DatabaseError: |
| 172 | logging.info("Could not open popcon index.") | 177 | logging.info("Could not open popcon index.") |
| 173 | - return True | ||
| 174 | return 0 | 178 | return 0 |
| 175 | 179 | ||
| 176 | def build_index(self): | 180 | def build_index(self): |
| @@ -224,35 +228,23 @@ class PopconXapianIndex(xapian.WritableDatabase): | @@ -224,35 +228,23 @@ class PopconXapianIndex(xapian.WritableDatabase): | ||
| 224 | submissions.append(submission) | 228 | submissions.append(submission) |
| 225 | return submissions | 229 | return submissions |
| 226 | 230 | ||
| 227 | - def hierarchical_clustering(self,data,clusters_dir,distance,k=10): | ||
| 228 | - """ | ||
| 229 | - Select popcon submissions from popcon_dir and place them at clusters_dir | ||
| 230 | - """ | ||
| 231 | - cl = cluster.HierarchicalClustering(data, lambda x,y: | ||
| 232 | - distance(x.packages.keys(), | ||
| 233 | - y.packages.keys())) | ||
| 234 | - clusters = cl.getlevel(0.5) | ||
| 235 | - for c in clusters: | ||
| 236 | - print "cluster" | ||
| 237 | - for submission in c: | ||
| 238 | - print submission.user_id | ||
| 239 | - | ||
| 240 | - def kmedoids_clustering(self,data,clusters_dir,distance,k=10): | 231 | + def kmedoids_clustering(self,data,clusters_dir,distance,k_medoids): |
| 241 | clusters = KMedoidsClustering(data,lambda x,y: | 232 | clusters = KMedoidsClustering(data,lambda x,y: |
| 242 | distance(x.packages.keys(), | 233 | distance(x.packages.keys(), |
| 243 | y.packages.keys())) | 234 | y.packages.keys())) |
| 244 | - medoids = clusters.getMedoids(2) | 235 | + medoids,dispersion = clusters.getMedoids(k_medoids) |
| 245 | for submission in medoids: | 236 | for submission in medoids: |
| 246 | shutil.copyfile(submission.path,os.path.join(clusters_dir, | 237 | shutil.copyfile(submission.path,os.path.join(clusters_dir, |
| 247 | submission.user_id)) | 238 | submission.user_id)) |
| 239 | + return dispersion | ||
| 248 | 240 | ||
| 249 | class KMedoidsClustering(cluster.KMeansClustering): | 241 | class KMedoidsClustering(cluster.KMeansClustering): |
| 250 | 242 | ||
| 251 | - def __init__(self,data,distance): | ||
| 252 | - if len(data)<100: | 243 | + def __init__(self,data,distance,max_data=100): |
| 244 | + if len(data)<max_data: | ||
| 253 | data_sample = data | 245 | data_sample = data |
| 254 | else: | 246 | else: |
| 255 | - data_sample = random.sample(data,100) | 247 | + data_sample = random.sample(data,max_data) |
| 256 | cluster.KMeansClustering.__init__(self, data_sample, distance) | 248 | cluster.KMeansClustering.__init__(self, data_sample, distance) |
| 257 | self.distanceMatrix = {} | 249 | self.distanceMatrix = {} |
| 258 | for submission in self._KMeansClustering__data: | 250 | for submission in self._KMeansClustering__data: |
| @@ -287,7 +279,7 @@ class KMedoidsClustering(cluster.KMeansClustering): | @@ -287,7 +279,7 @@ class KMedoidsClustering(cluster.KMeansClustering): | ||
| 287 | logging.debug("medoidDistance: %f" % medoidDistance) | 279 | logging.debug("medoidDistance: %f" % medoidDistance) |
| 288 | logging.debug("Cluster medoid: [%d] %s" % (medoid, | 280 | logging.debug("Cluster medoid: [%d] %s" % (medoid, |
| 289 | cluster[medoid].user_id)) | 281 | cluster[medoid].user_id)) |
| 290 | - return cluster[medoid] | 282 | + return (cluster[medoid],medoidDistance) |
| 291 | 283 | ||
| 292 | def assign_item(self, item, origin): | 284 | def assign_item(self, item, origin): |
| 293 | """ | 285 | """ |
| @@ -295,7 +287,8 @@ class KMedoidsClustering(cluster.KMeansClustering): | @@ -295,7 +287,8 @@ class KMedoidsClustering(cluster.KMeansClustering): | ||
| 295 | """ | 287 | """ |
| 296 | closest_cluster = origin | 288 | closest_cluster = origin |
| 297 | for cluster in self._KMeansClustering__clusters: | 289 | for cluster in self._KMeansClustering__clusters: |
| 298 | - if self.distance(item,self.getMedoid(cluster)) < self.distance(item,self.getMedoid(closest_cluster)): | 290 | + if self.distance(item,self.getMedoid(cluster)[0]) < \ |
| 291 | + self.distance(item,self.getMedoid(closest_cluster)[0]): | ||
| 299 | closest_cluster = cluster | 292 | closest_cluster = cluster |
| 300 | 293 | ||
| 301 | if closest_cluster != origin: | 294 | if closest_cluster != origin: |
| @@ -309,6 +302,8 @@ class KMedoidsClustering(cluster.KMeansClustering): | @@ -309,6 +302,8 @@ class KMedoidsClustering(cluster.KMeansClustering): | ||
| 309 | """ | 302 | """ |
| 310 | Generate n clusters and return their medoids. | 303 | Generate n clusters and return their medoids. |
| 311 | """ | 304 | """ |
| 312 | - medoids = [self.getMedoid(cluster) for cluster in self.getclusters(n)] | ||
| 313 | - logging.info("Clustering completed and the following centroids were found: %s" % [c.user_id for c in medoids]) | ||
| 314 | - return medoids | 305 | + medoids_distances = [self.getMedoid(cluster) for cluster in self.getclusters(n)] |
| 306 | + medoids = [m[0] for m in medoids_distances] | ||
| 307 | + dispersion = sum([m[1] for m in medoids_distances]) | ||
| 308 | + logging.info("Clustering completed and the following medoids were found: %s" % [c.user_id for c in medoids]) | ||
| 309 | + return medoids,dispersion |
src/evaluation.py
| @@ -49,6 +49,45 @@ class Metric(Singleton): | @@ -49,6 +49,45 @@ class Metric(Singleton): | ||
| 49 | evaluation.real_item_scores[k])) | 49 | evaluation.real_item_scores[k])) |
| 50 | return errors | 50 | return errors |
| 51 | 51 | ||
| 52 | + | ||
| 53 | +class SimpleAccuracy(Metric): | ||
| 54 | + """ | ||
| 55 | + Classification accuracy metric which does not consider class sizes. | ||
| 56 | + """ | ||
| 57 | + def __init__(self): | ||
| 58 | + """ | ||
| 59 | + Set metric description. | ||
| 60 | + """ | ||
| 61 | + self.desc = " S_Accuracy " | ||
| 62 | + | ||
| 63 | + def run(self,evaluation): | ||
| 64 | + """ | ||
| 65 | + Compute metric. | ||
| 66 | + """ | ||
| 67 | + return float((evaluation.repository_size- | ||
| 68 | + len(evaluation.false_positive))- | ||
| 69 | + len(evaluation.false_negative))/evaluation.repository_size | ||
| 70 | + | ||
| 71 | +class Accuracy(Metric): | ||
| 72 | + """ | ||
| 73 | + Classification accuracy metric which considers class sizes. | ||
| 74 | + """ | ||
| 75 | + def __init__(self): | ||
| 76 | + """ | ||
| 77 | + Set metric description. | ||
| 78 | + """ | ||
| 79 | + self.desc = " Accuracy " | ||
| 80 | + | ||
| 81 | + def run(self,evaluation): | ||
| 82 | + """ | ||
| 83 | + Compute metric. | ||
| 84 | + """ | ||
| 85 | + error_1 = (float(len(evaluation.false_positive))/ | ||
| 86 | + (evaluation.repository_size-len(evaluation.real_relevant))) | ||
| 87 | + error_2 = (float(len(evaluation.false_negative))/len(evaluation.real_relevant)) | ||
| 88 | + accuracy = 1-(float(error_1+error_2)/2) | ||
| 89 | + return accuracy | ||
| 90 | + | ||
| 52 | class Precision(Metric): | 91 | class Precision(Metric): |
| 53 | """ | 92 | """ |
| 54 | Classification accuracy metric defined as the percentage of relevant items | 93 | Classification accuracy metric defined as the percentage of relevant items |
| @@ -64,7 +103,7 @@ class Precision(Metric): | @@ -64,7 +103,7 @@ class Precision(Metric): | ||
| 64 | """ | 103 | """ |
| 65 | Compute metric. | 104 | Compute metric. |
| 66 | """ | 105 | """ |
| 67 | - return float(len(evaluation.predicted_real))/len(evaluation.predicted_relevant) | 106 | + return float(len(evaluation.true_positive))/len(evaluation.predicted_relevant) |
| 68 | 107 | ||
| 69 | class Recall(Metric): | 108 | class Recall(Metric): |
| 70 | """ | 109 | """ |
| @@ -81,7 +120,7 @@ class Recall(Metric): | @@ -81,7 +120,7 @@ class Recall(Metric): | ||
| 81 | """ | 120 | """ |
| 82 | Compute metric. | 121 | Compute metric. |
| 83 | """ | 122 | """ |
| 84 | - return float(len(evaluation.predicted_real))/len(evaluation.real_relevant) | 123 | + return float(len(evaluation.true_positive))/len(evaluation.real_relevant) |
| 85 | 124 | ||
| 86 | class F1(Metric): | 125 | class F1(Metric): |
| 87 | """ | 126 | """ |
| @@ -100,7 +139,10 @@ class F1(Metric): | @@ -100,7 +139,10 @@ class F1(Metric): | ||
| 100 | """ | 139 | """ |
| 101 | p = Precision().run(evaluation) | 140 | p = Precision().run(evaluation) |
| 102 | r = Recall().run(evaluation) | 141 | r = Recall().run(evaluation) |
| 103 | - return float((2*p*r))/(p+r) | 142 | + if (p+r)>0: |
| 143 | + return float((2*p*r))/(p+r) | ||
| 144 | + else: | ||
| 145 | + return 0 | ||
| 104 | 146 | ||
| 105 | class MAE(Metric): | 147 | class MAE(Metric): |
| 106 | """ | 148 | """ |
| @@ -158,43 +200,47 @@ class Coverage(Metric): | @@ -158,43 +200,47 @@ class Coverage(Metric): | ||
| 158 | Evaluation metric defined as the percentage of items covered by the | 200 | Evaluation metric defined as the percentage of items covered by the |
| 159 | recommender (have been recommended at least once). | 201 | recommender (have been recommended at least once). |
| 160 | """ | 202 | """ |
| 161 | - def __init__(self,repository_size): | 203 | + def __init__(self): |
| 162 | """ | 204 | """ |
| 163 | Set initial parameters. | 205 | Set initial parameters. |
| 164 | """ | 206 | """ |
| 165 | self.desc = " Coverage " | 207 | self.desc = " Coverage " |
| 166 | - self.repository_size = repository_size | ||
| 167 | - self.covered = set() | ||
| 168 | - | ||
| 169 | - def save_covered(self,recommended_list): | ||
| 170 | - """ | ||
| 171 | - Register that a list of itens has been recommended. | ||
| 172 | - """ | ||
| 173 | - self.covered.update(set(recommended_list)) | ||
| 174 | 208 | ||
| 175 | - def run(self,evaluation): | 209 | + def run(self,evaluations_set): |
| 176 | """ | 210 | """ |
| 177 | Compute metric. | 211 | Compute metric. |
| 178 | """ | 212 | """ |
| 179 | - return float(self.covered.size)/self.repository_size | 213 | + covered = set() |
| 214 | + for evaluation in evaluations_set: | ||
| 215 | + covered.update(set(evaluation.predicted_relevant)) | ||
| 216 | + return float(len(covered))/evaluation.repository_size | ||
| 180 | 217 | ||
| 181 | class Evaluation: | 218 | class Evaluation: |
| 182 | """ | 219 | """ |
| 183 | Class designed to perform prediction evaluation, given data and metric. | 220 | Class designed to perform prediction evaluation, given data and metric. |
| 184 | """ | 221 | """ |
| 185 | - def __init__(self,predicted_result,real_result): | 222 | + def __init__(self,predicted,real,repository_size): |
| 186 | """ | 223 | """ |
| 187 | Set initial parameters. | 224 | Set initial parameters. |
| 188 | """ | 225 | """ |
| 189 | - self.predicted_item_scores = predicted_result.item_score | ||
| 190 | - self.predicted_relevant = predicted_result.get_prediction() | ||
| 191 | - self.real_item_scores = real_result.item_score | ||
| 192 | - self.real_relevant = real_result.get_prediction() | ||
| 193 | - self.predicted_real = [v for v in self.predicted_relevant if v in | ||
| 194 | - self.real_relevant] | ||
| 195 | - #print len(self.predicted_relevant) | ||
| 196 | - #print len(self.real_relevant) | ||
| 197 | - #print len(self.predicted_real) | 226 | + self.repository_size = repository_size |
| 227 | + self.predicted_item_scores = predicted.item_score | ||
| 228 | + self.predicted_relevant = predicted.get_prediction() | ||
| 229 | + self.real_item_scores = real.item_score | ||
| 230 | + self.real_relevant = real.get_prediction() | ||
| 231 | + | ||
| 232 | + self.true_positive = [v[0] for v in self.predicted_relevant if v[0] in | ||
| 233 | + [w[0] for w in self.real_relevant]] | ||
| 234 | + self.false_positive = [v[0] for v in self.predicted_relevant if not v[0] in | ||
| 235 | + [w[0] for w in self.real_relevant]] | ||
| 236 | + self.false_negative = [v[0] for v in self.real_relevant if not v[0] in | ||
| 237 | + [w[0] for w in self.predicted_relevant]] | ||
| 238 | + | ||
| 239 | + logging.debug("TP: %d" % len(self.true_positive)) | ||
| 240 | + logging.debug("FP: %d" % len(self.false_positive)) | ||
| 241 | + logging.debug("FN: %d" % len(self.false_negative)) | ||
| 242 | + logging.debug("Repo_size: %d" % self.repository_size) | ||
| 243 | + logging.debug("Relevant: %d" % len(self.real_relevant)) | ||
| 198 | 244 | ||
| 199 | def run(self,metric): | 245 | def run(self,metric): |
| 200 | """ | 246 | """ |
| @@ -206,7 +252,7 @@ class CrossValidation: | @@ -206,7 +252,7 @@ class CrossValidation: | ||
| 206 | """ | 252 | """ |
| 207 | Class designed to perform cross-validation process. | 253 | Class designed to perform cross-validation process. |
| 208 | """ | 254 | """ |
| 209 | - def __init__(self,partition_proportion,rounds,rec,metrics_list): | 255 | + def __init__(self,partition_proportion,rounds,rec,metrics_list,result_proportion): |
| 210 | """ | 256 | """ |
| 211 | Set initial parameters. | 257 | Set initial parameters. |
| 212 | """ | 258 | """ |
| @@ -219,34 +265,13 @@ class CrossValidation: | @@ -219,34 +265,13 @@ class CrossValidation: | ||
| 219 | self.recommender = rec | 265 | self.recommender = rec |
| 220 | self.metrics_list = metrics_list | 266 | self.metrics_list = metrics_list |
| 221 | self.cross_results = defaultdict(list) | 267 | self.cross_results = defaultdict(list) |
| 222 | - | ||
| 223 | - def __str__(self): | ||
| 224 | - """ | ||
| 225 | - String representation of the object. | ||
| 226 | - """ | ||
| 227 | - str = "\n" | ||
| 228 | - metrics_desc = "" | ||
| 229 | - for metric in self.metrics_list: | ||
| 230 | - metrics_desc += "%s|" % (metric.desc) | ||
| 231 | - str += "| Round |%s\n" % metrics_desc | ||
| 232 | - for r in range(self.rounds): | ||
| 233 | - metrics_result = "" | ||
| 234 | - for metric in self.metrics_list: | ||
| 235 | - metrics_result += (" %2.1f%% |" % | ||
| 236 | - (self.cross_results[metric.desc][r]*100)) | ||
| 237 | - str += "| %d |%s\n" % (r,metrics_result) | ||
| 238 | - metrics_mean = "" | ||
| 239 | - for metric in self.metrics_list: | ||
| 240 | - mean = float(sum(self.cross_results[metric.desc]) / | ||
| 241 | - len(self.cross_results[metric.desc])) | ||
| 242 | - metrics_mean += " %2.1f%% |" % (mean*100) | ||
| 243 | - str += "| Mean |%s\n" % (metrics_mean) | ||
| 244 | - return str | 268 | + self.result_proportion = result_proportion |
| 245 | 269 | ||
| 246 | def run(self,user): | 270 | def run(self,user): |
| 247 | """ | 271 | """ |
| 248 | Perform cross-validation. | 272 | Perform cross-validation. |
| 249 | """ | 273 | """ |
| 274 | + # | ||
| 250 | cross_item_score = dict.fromkeys(user.pkg_profile,1) | 275 | cross_item_score = dict.fromkeys(user.pkg_profile,1) |
| 251 | partition_size = int(len(cross_item_score)*self.partition_proportion) | 276 | partition_size = int(len(cross_item_score)*self.partition_proportion) |
| 252 | for r in range(self.rounds): | 277 | for r in range(self.rounds): |
| @@ -258,10 +283,17 @@ class CrossValidation: | @@ -258,10 +283,17 @@ class CrossValidation: | ||
| 258 | logging.critical("Empty cross_item_score.") | 283 | logging.critical("Empty cross_item_score.") |
| 259 | raise Error | 284 | raise Error |
| 260 | round_partition[random_key] = cross_item_score.pop(random_key) | 285 | round_partition[random_key] = cross_item_score.pop(random_key) |
| 286 | + #logging.debug("Round partition: %s",str(round_partition)) | ||
| 287 | + #logging.debug("Cross item-score: %s",str(cross_item_score)) | ||
| 261 | round_user = User(cross_item_score) | 288 | round_user = User(cross_item_score) |
| 262 | - predicted_result = self.recommender.get_recommendation(round_user) | ||
| 263 | - real_result = RecommendationResult(round_partition,len(round_partition)) | ||
| 264 | - evaluation = Evaluation(predicted_result,real_result) | 289 | + result_size = int(self.recommender.items_repository.get_doccount()* |
| 290 | + self.result_proportion) | ||
| 291 | + predicted_result = self.recommender.get_recommendation(round_user,result_size) | ||
| 292 | + print len(round_partition) | ||
| 293 | + real_result = RecommendationResult(round_partition) | ||
| 294 | + #logging.debug("Predicted result: %s",predicted_result) | ||
| 295 | + evaluation = Evaluation(predicted_result,real_result, | ||
| 296 | + self.recommender.items_repository.get_doccount()) | ||
| 265 | for metric in self.metrics_list: | 297 | for metric in self.metrics_list: |
| 266 | result = evaluation.run(metric) | 298 | result = evaluation.run(metric) |
| 267 | self.cross_results[metric.desc].append(result) | 299 | self.cross_results[metric.desc].append(result) |
| @@ -269,3 +301,26 @@ class CrossValidation: | @@ -269,3 +301,26 @@ class CrossValidation: | ||
| 269 | item,score = round_partition.popitem() | 301 | item,score = round_partition.popitem() |
| 270 | cross_item_score[item] = score | 302 | cross_item_score[item] = score |
| 271 | 303 | ||
| 304 | + def __str__(self): | ||
| 305 | + """ | ||
| 306 | + String representation of the object. | ||
| 307 | + """ | ||
| 308 | + str = "\n" | ||
| 309 | + metrics_desc = "" | ||
| 310 | + for metric in self.metrics_list: | ||
| 311 | + metrics_desc += "%s|" % (metric.desc) | ||
| 312 | + str += "| Round |%s\n" % metrics_desc | ||
| 313 | + for r in range(self.rounds): | ||
| 314 | + metrics_result = "" | ||
| 315 | + for metric in self.metrics_list: | ||
| 316 | + metrics_result += (" %2.1f%% |" % | ||
| 317 | + (self.cross_results[metric.desc][r]*100)) | ||
| 318 | + str += "| %d |%s\n" % (r,metrics_result) | ||
| 319 | + metrics_mean = "" | ||
| 320 | + for metric in self.metrics_list: | ||
| 321 | + mean = float(sum(self.cross_results[metric.desc]) / | ||
| 322 | + len(self.cross_results[metric.desc])) | ||
| 323 | + metrics_mean += " %2.1f%% |" % (mean*100) | ||
| 324 | + str += "| Mean |%s\n" % (metrics_mean) | ||
| 325 | + return str | ||
| 326 | + |
src/recommender.py
| @@ -45,13 +45,15 @@ class RecommendationResult: | @@ -45,13 +45,15 @@ class RecommendationResult: | ||
| 45 | str += "%2d: %s\n" % (i,result[i][0]) | 45 | str += "%2d: %s\n" % (i,result[i][0]) |
| 46 | return str | 46 | return str |
| 47 | 47 | ||
| 48 | - def get_prediction(self,limit=20): | 48 | + def get_prediction(self,limit=0): |
| 49 | """ | 49 | """ |
| 50 | Return prediction based on recommendation size (number of items). | 50 | Return prediction based on recommendation size (number of items). |
| 51 | """ | 51 | """ |
| 52 | - if limit > self.size: limit = self.size | ||
| 53 | sorted_result = sorted(self.item_score.items(), | 52 | sorted_result = sorted(self.item_score.items(), |
| 54 | key=operator.itemgetter(1)) | 53 | key=operator.itemgetter(1)) |
| 54 | + if not limit or limit > self.size: | ||
| 55 | + limit = self.size | ||
| 56 | + | ||
| 55 | return list(reversed(sorted_result[-limit:])) | 57 | return list(reversed(sorted_result[-limit:])) |
| 56 | 58 | ||
| 57 | class Recommender: | 59 | class Recommender: |
| @@ -63,13 +65,12 @@ class Recommender: | @@ -63,13 +65,12 @@ class Recommender: | ||
| 63 | Set initial parameters. | 65 | Set initial parameters. |
| 64 | """ | 66 | """ |
| 65 | self.items_repository = xapian.Database(cfg.axi) | 67 | self.items_repository = xapian.Database(cfg.axi) |
| 66 | - self.users_repository = data.PopconXapianIndex(cfg) | ||
| 67 | - #self.clustered_users_repository = data.PopconXapianIndex(cfg) | ||
| 68 | self.set_strategy(cfg.strategy) | 68 | self.set_strategy(cfg.strategy) |
| 69 | if cfg.weight == "bm25": | 69 | if cfg.weight == "bm25": |
| 70 | self.weight = xapian.BM25Weight() | 70 | self.weight = xapian.BM25Weight() |
| 71 | else: | 71 | else: |
| 72 | self.weight = xapian.TradWeight() | 72 | self.weight = xapian.TradWeight() |
| 73 | + self.cfg = cfg | ||
| 73 | 74 | ||
| 74 | def set_strategy(self,strategy_str): | 75 | def set_strategy(self,strategy_str): |
| 75 | """ | 76 | """ |
| @@ -83,6 +84,7 @@ class Recommender: | @@ -83,6 +84,7 @@ class Recommender: | ||
| 83 | self.strategy = strategy.ContentBasedStrategy("desc") | 84 | self.strategy = strategy.ContentBasedStrategy("desc") |
| 84 | if strategy_str == "col": | 85 | if strategy_str == "col": |
| 85 | self.strategy = strategy.CollaborativeStrategy(20) | 86 | self.strategy = strategy.CollaborativeStrategy(20) |
| 87 | + self.users_repository = data.PopconXapianIndex(self.cfg) | ||
| 86 | 88 | ||
| 87 | def get_recommendation(self,user,result_size=20): | 89 | def get_recommendation(self,user,result_size=20): |
| 88 | """ | 90 | """ |
src/tests/data_tests.py
| @@ -71,13 +71,13 @@ class PopconXapianIndexTests(unittest2.TestCase): | @@ -71,13 +71,13 @@ class PopconXapianIndexTests(unittest2.TestCase): | ||
| 71 | 71 | ||
| 72 | def test_reindex(self): | 72 | def test_reindex(self): |
| 73 | # force reindex with no clustering | 73 | # force reindex with no clustering |
| 74 | - self.cfg.index_mode = "10" | 74 | + self.cfg.index_mode = "reindex" |
| 75 | pxi = PopconXapianIndex(self.cfg) | 75 | pxi = PopconXapianIndex(self.cfg) |
| 76 | self.assertEqual(pxi.get_metadata("old"),"") | 76 | self.assertEqual(pxi.get_metadata("old"),"") |
| 77 | 77 | ||
| 78 | def test_clustering(self): | 78 | def test_clustering(self): |
| 79 | # force reindex with clustering | 79 | # force reindex with clustering |
| 80 | - self.cfg.index_mode = "11" | 80 | + self.cfg.index_mode = "cluster" |
| 81 | pxi = PopconXapianIndex(self.cfg) | 81 | pxi = PopconXapianIndex(self.cfg) |
| 82 | self.assertEqual(pxi.source_dir,self.cfg.clusters_dir) | 82 | self.assertEqual(pxi.source_dir,self.cfg.clusters_dir) |
| 83 | all_submissions = [submissions for (root, dirs, submissions) in | 83 | all_submissions = [submissions for (root, dirs, submissions) in |
| @@ -95,6 +95,13 @@ class PopconXapianIndexTests(unittest2.TestCase): | @@ -95,6 +95,13 @@ class PopconXapianIndexTests(unittest2.TestCase): | ||
| 95 | sum([len(submissions) for submissions in | 95 | sum([len(submissions) for submissions in |
| 96 | all_submissions])) | 96 | all_submissions])) |
| 97 | 97 | ||
| 98 | + def test_recluster(self): | ||
| 99 | + # force reindexing and clustering | ||
| 100 | + self.cfg.index_mode = "recluster" | ||
| 101 | + self.cfg.k_medoids = 2 | ||
| 102 | + pxi = PopconXapianIndex(self.cfg) | ||
| 103 | + self.assertEqual(pxi.source_dir,self.cfg.clusters_dir) | ||
| 104 | + self.assertEqual(pxi.get_doccount(),2) | ||
| 98 | 105 | ||
| 99 | if __name__ == '__main__': | 106 | if __name__ == '__main__': |
| 100 | unittest2.main() | 107 | unittest2.main() |
src/tests/test_data/popcon_dir/test_popcon_0
| 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_0 ARCH:i386 POPCONVER:1.52 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_0 ARCH:i386 POPCONVER:1.52 |
| 2 | -1309407475 1303670994 perl-base /usr/bin/perl | ||
| 3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | ||
| 4 | -1309407450 1303670973 libc6 /lib/ld-2.11.2.so | 2 | +1309407475 1303670994 gimp /usr/bin/perl |
| 3 | +1309407451 1303670982 inkscape /lib/i686/cmov/libc-2.11.2.so | ||
| 4 | +1309407450 1303670973 imagination /lib/ld-2.11.2.so | ||
| 5 | 1309407434 1295654294 dash /bin/dash | 5 | 1309407434 1295654294 dash /bin/dash |
| 6 | 0 0 libusbmuxd1 <NOFILES> | 6 | 0 0 libusbmuxd1 <NOFILES> |
| 7 | END-POPULARITY-CONTEST-0 TIME:1309407492 | 7 | END-POPULARITY-CONTEST-0 TIME:1309407492 |
src/tests/test_data/popcon_dir/test_popcon_1
| 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_1 ARCH:i386 POPCONVER:1.52 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_1 ARCH:i386 POPCONVER:1.52 |
| 2 | -1309407475 1303670994 perl-base /usr/bin/perl | ||
| 3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | ||
| 4 | -1309407450 1303670973 libc6 /lib/ld-2.11.2.so | 2 | +1309407475 1303670994 gimp /usr/bin/perl |
| 5 | 1309407434 1295654294 dash /bin/dash | 3 | 1309407434 1295654294 dash /bin/dash |
| 6 | 0 0 libusbmuxd1 <NOFILES> | 4 | 0 0 libusbmuxd1 <NOFILES> |
| 7 | END-POPULARITY-CONTEST-0 TIME:1309407492 | 5 | END-POPULARITY-CONTEST-0 TIME:1309407492 |
src/tests/test_data/popcon_dir/test_popcon_2
| 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_2 ARCH:i386 POPCONVER:1.52 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_2 ARCH:i386 POPCONVER:1.52 |
| 2 | -1309407475 1303670994 perl-base /usr/bin/perl | ||
| 3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | 2 | +1309407475 1303670994 iceweasel /usr/bin/perl |
| 3 | +1309407451 1303670982 python /lib/i686/cmov/libc-2.11.2.so | ||
| 4 | 1309407450 1303670973 libc6 /lib/ld-2.11.2.so | 4 | 1309407450 1303670973 libc6 /lib/ld-2.11.2.so |
| 5 | 1309407434 1295654294 dash /bin/dash | 5 | 1309407434 1295654294 dash /bin/dash |
| 6 | 0 0 libusbmuxd1 <NOFILES> | 6 | 0 0 libusbmuxd1 <NOFILES> |
src/tests/test_data/popcon_dir/test_popcon_3
| 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_3 ARCH:i386 POPCONVER:1.52 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_3 ARCH:i386 POPCONVER:1.52 |
| 2 | -1309407475 1303670994 perl-base /usr/bin/perl | ||
| 3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | 2 | +1309407475 1303670994 eog /usr/bin/perl |
| 3 | +1309407451 1303670982 nautilus /lib/i686/cmov/libc-2.11.2.so | ||
| 4 | 1309407450 1303670973 libc6 /lib/ld-2.11.2.so | 4 | 1309407450 1303670973 libc6 /lib/ld-2.11.2.so |
| 5 | -1309407434 1295654294 dash /bin/dash | ||
| 6 | 0 0 libusbmuxd1 <NOFILES> | 5 | 0 0 libusbmuxd1 <NOFILES> |
| 7 | END-POPULARITY-CONTEST-0 TIME:1309407492 | 6 | END-POPULARITY-CONTEST-0 TIME:1309407492 |
src/tests/test_data/popcon_dir/test_popcon_4
| 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_4 ARCH:i386 POPCONVER:1.52 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_4 ARCH:i386 POPCONVER:1.52 |
| 2 | -1309407475 1303670994 perl-base /usr/bin/perl | ||
| 3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | 2 | +1309407475 1303670994 konqueror /usr/bin/perl |
| 3 | +1309407451 1303670982 kedit /lib/i686/cmov/libc-2.11.2.so | ||
| 4 | 1309407450 1303670973 libc6 /lib/ld-2.11.2.so | 4 | 1309407450 1303670973 libc6 /lib/ld-2.11.2.so |
| 5 | -1309407434 1295654294 dash /bin/dash | ||
| 6 | 0 0 libusbmuxd1 <NOFILES> | 5 | 0 0 libusbmuxd1 <NOFILES> |
| 7 | END-POPULARITY-CONTEST-0 TIME:1309407492 | 6 | END-POPULARITY-CONTEST-0 TIME:1309407492 |
src/tests/test_data/popcon_dir/test_popcon_5
| 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_5 ARCH:i386 POPCONVER:1.52 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_5 ARCH:i386 POPCONVER:1.52 |
| 2 | -1309407475 1303670994 perl-base /usr/bin/perl | ||
| 3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | ||
| 4 | -1309407450 1303670973 libc6 /lib/ld-2.11.2.so | 2 | +1309407475 1303670994 konqueror /usr/bin/perl |
| 5 | 1309407434 1295654294 dash /bin/dash | 3 | 1309407434 1295654294 dash /bin/dash |
| 6 | 0 0 libusbmuxd1 <NOFILES> | 4 | 0 0 libusbmuxd1 <NOFILES> |
| 7 | END-POPULARITY-CONTEST-0 TIME:1309407492 | 5 | END-POPULARITY-CONTEST-0 TIME:1309407492 |
src/tests/test_data/popcon_dir/test_popcon_6
| 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_6 ARCH:i386 POPCONVER:1.52 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_6 ARCH:i386 POPCONVER:1.52 |
| 2 | 1309407475 1303670994 perl-base /usr/bin/perl | 2 | 1309407475 1303670994 perl-base /usr/bin/perl |
| 3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | ||
| 4 | -1309407450 1303670973 libc6 /lib/ld-2.11.2.so | 3 | +1309407451 1303670982 eog /lib/i686/cmov/libc-2.11.2.so |
| 4 | +1309407450 1303670973 nautilus /lib/ld-2.11.2.so | ||
| 5 | 1309407434 1295654294 dash /bin/dash | 5 | 1309407434 1295654294 dash /bin/dash |
| 6 | 0 0 libusbmuxd1 <NOFILES> | 6 | 0 0 libusbmuxd1 <NOFILES> |
| 7 | END-POPULARITY-CONTEST-0 TIME:1309407492 | 7 | END-POPULARITY-CONTEST-0 TIME:1309407492 |
src/tests/test_data/popcon_dir/test_popcon_7
| 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_7 ARCH:i386 POPCONVER:1.52 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_7 ARCH:i386 POPCONVER:1.52 |
| 2 | -1309407475 1303670994 perl-base /usr/bin/perl | ||
| 3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | ||
| 4 | -1309407450 1303670973 libc6 /lib/ld-2.11.2.so | 2 | +1309407475 1303670994 apticron /usr/bin/perl |
| 3 | +1309407451 1303670982 aptitude /lib/i686/cmov/libc-2.11.2.so | ||
| 4 | +1309407450 1303670973 apt /lib/ld-2.11.2.so | ||
| 5 | 1309407434 1295654294 dash /bin/dash | 5 | 1309407434 1295654294 dash /bin/dash |
| 6 | 0 0 libusbmuxd1 <NOFILES> | 6 | 0 0 libusbmuxd1 <NOFILES> |
| 7 | END-POPULARITY-CONTEST-0 TIME:1309407492 | 7 | END-POPULARITY-CONTEST-0 TIME:1309407492 |
src/tests/test_data/popcon_dir/test_popcon_8
| 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_8 ARCH:i386 POPCONVER:1.52 | 1 | POPULARITY-CONTEST-0 TIME:1309407492 ID:8b44fcdbcf676e711a153d5db0test_8 ARCH:i386 POPCONVER:1.52 |
| 2 | -1309407475 1303670994 perl-base /usr/bin/perl | ||
| 3 | -1309407451 1303670982 libc6-i686 /lib/i686/cmov/libc-2.11.2.so | ||
| 4 | -1309407450 1303670973 libc6 /lib/ld-2.11.2.so | 2 | +1309407475 1303670994 apticron /usr/bin/perl |
| 3 | +1309407451 1303670982 eog /lib/i686/cmov/libc-2.11.2.so | ||
| 4 | +1309407450 1303670973 nautilus /lib/ld-2.11.2.so | ||
| 5 | 1309407434 1295654294 dash /bin/dash | 5 | 1309407434 1295654294 dash /bin/dash |
| 6 | 0 0 libusbmuxd1 <NOFILES> | 6 | 0 0 libusbmuxd1 <NOFILES> |
| 7 | END-POPULARITY-CONTEST-0 TIME:1309407492 | 7 | END-POPULARITY-CONTEST-0 TIME:1309407492 |
src/user.py
| @@ -152,6 +152,24 @@ class User: | @@ -152,6 +152,24 @@ class User: | ||
| 152 | desc_profile = self.desc_profile(items_repository,size)[:size/2] | 152 | desc_profile = self.desc_profile(items_repository,size)[:size/2] |
| 153 | return tag_profile+desc_profile | 153 | return tag_profile+desc_profile |
| 154 | 154 | ||
def app_pkg_profile(self, axi):
    """
    Reduce the package profile to application packages and return it.

    A package is kept when the tags returned by
    data.axi_search_pkg_tags(axi, pkg) contain "XTrole::program".
    Packages whose tags cannot be inspected are logged and left in the
    profile (best-effort: they are never dropped by this method).

    axi is passed through unchanged to data.axi_search_pkg_tags
    (presumably the apt-xapian-index handle -- confirm against callers).

    self.pkg_profile is filtered in place, so the returned list is the
    very same object as self.pkg_profile.
    """
    old_profile_size = len(self.pkg_profile)
    kept = []
    for pkg in self.pkg_profile:
        tags = data.axi_search_pkg_tags(axi, pkg)
        try:
            # Only the membership test is guarded: it fails when the lookup
            # did not hand back a container of tags.
            is_program = "XTrole::program" in tags
        except Exception:
            logging.debug("Package not found in axi: %s", pkg)
            # Unknown packages stay in the profile.
            is_program = True
        if is_program:
            kept.append(pkg)
    # Slice assignment keeps the identity of the list, so other references
    # to self.pkg_profile observe the reduced profile as well. Rebuilding
    # once also avoids repeated O(n) list.remove() calls.
    self.pkg_profile[:] = kept
    profile_size = len(self.pkg_profile)
    # Adjacent literals instead of a backslash inside the string, which
    # would embed the source indentation in the logged message.
    logging.debug("App package profile: reduced packages profile size "
                  "from %d to %d.", old_profile_size, profile_size)
    return self.pkg_profile
| 172 | + | ||
| 155 | def maximal_pkg_profile(self): | 173 | def maximal_pkg_profile(self): |
| 156 | """ | 174 | """ |
| 157 | Return list of packages that are not dependence of any other package in | 175 | Return list of packages that are not dependence of any other package in |