Commit b48d6eca8ef182ab26f13804f0ac48deed722f70
1 parent
5011c245
Exists in
master
and in
1 other branch
New index_mode scheme allows the following options: 'old', 'reindex', 'cluster' and 'recluster'. Refactored k_medoids_clustering.
Showing
1 changed file
with
30 additions
and
33 deletions
Show diff stats
src/data.py
| @@ -129,31 +129,36 @@ class PopconXapianIndex(xapian.WritableDatabase): | @@ -129,31 +129,36 @@ class PopconXapianIndex(xapian.WritableDatabase): | ||
| 129 | """ | 129 | """ |
| 130 | self.axi = xapian.Database(cfg.axi) | 130 | self.axi = xapian.Database(cfg.axi) |
| 131 | self.path = os.path.expanduser(cfg.popcon_index) | 131 | self.path = os.path.expanduser(cfg.popcon_index) |
| 132 | - if cfg.index_mode.startswith("1") or not self.load_index(): | 132 | + if not cfg.index_mode == "old" or not self.load_index(): |
| 133 | if not os.path.exists(cfg.popcon_dir): | 133 | if not os.path.exists(cfg.popcon_dir): |
| 134 | os.makedirs(cfg.popcon_dir) | 134 | os.makedirs(cfg.popcon_dir) |
| 135 | if not os.listdir(cfg.popcon_dir): | 135 | if not os.listdir(cfg.popcon_dir): |
| 136 | logging.critical("Popcon dir seems to be empty.") | 136 | logging.critical("Popcon dir seems to be empty.") |
| 137 | raise Error | 137 | raise Error |
| 138 | - if cfg.index_mode == "10": | 138 | + if cfg.index_mode == "reindex": |
| 139 | self.source_dir = os.path.expanduser(cfg.popcon_dir) | 139 | self.source_dir = os.path.expanduser(cfg.popcon_dir) |
| 140 | else: | 140 | else: |
| 141 | self.source_dir = os.path.expanduser(cfg.clusters_dir) | 141 | self.source_dir = os.path.expanduser(cfg.clusters_dir) |
| 142 | if not os.path.exists(cfg.clusters_dir): | 142 | if not os.path.exists(cfg.clusters_dir): |
| 143 | os.makedirs(cfg.clusters_dir) | 143 | os.makedirs(cfg.clusters_dir) |
| 144 | - if not os.listdir(cfg.clusters_dir): | ||
| 145 | - distance = JaccardDistance() | 144 | + if not os.listdir(cfg.clusters_dir) or \ |
| 145 | + cfg.index_mode == "recluster": | ||
| 146 | + shutil.rmtree(cfg.clusters_dir,1) | ||
| 147 | + os.makedirs(cfg.clusters_dir) | ||
| 146 | logging.info("Clustering popcon submissions from \'%s\'" | 148 | logging.info("Clustering popcon submissions from \'%s\'" |
| 147 | % cfg.popcon_dir) | 149 | % cfg.popcon_dir) |
| 148 | logging.info("Clusters will be placed at \'%s\'" | 150 | logging.info("Clusters will be placed at \'%s\'" |
| 149 | % cfg.clusters_dir) | 151 | % cfg.clusters_dir) |
| 152 | + distance = JaccardDistance() | ||
| 150 | data = self.get_submissions(cfg.popcon_dir) | 153 | data = self.get_submissions(cfg.popcon_dir) |
| 151 | - if cfg.clustering == "Hierarchical": | ||
| 152 | - self.hierarchical_clustering(data,cfg.clusters_dir, | ||
| 153 | - distance) | ||
| 154 | - else: | ||
| 155 | - self.kmedoids_clustering(data,cfg.clusters_dir, | ||
| 156 | - distance) | 154 | + self.cluster_dispersion = \ |
| 155 | + self.kmedoids_clustering(data, cfg.clusters_dir, | ||
| 156 | + distance, cfg.k_medoids) | ||
| 157 | + logging.info("Clusters dispersion: %f.2", | ||
| 158 | + self.cluster_dispersion) | ||
| 159 | + else: | ||
| 160 | + logging.info("Using clusters from \'%s\'" % | ||
| 161 | + cfg.clusters_dir) | ||
| 157 | self.build_index() | 162 | self.build_index() |
| 158 | 163 | ||
| 159 | def __str__(self): | 164 | def __str__(self): |
| @@ -167,6 +172,7 @@ class PopconXapianIndex(xapian.WritableDatabase): | @@ -167,6 +172,7 @@ class PopconXapianIndex(xapian.WritableDatabase): | ||
| 167 | logging.info("Opening existing popcon xapian index at \'%s\'" | 172 | logging.info("Opening existing popcon xapian index at \'%s\'" |
| 168 | % self.path) | 173 | % self.path) |
| 169 | xapian.Database.__init__(self,self.path) | 174 | xapian.Database.__init__(self,self.path) |
| 175 | + return 1 | ||
| 170 | except xapian.DatabaseError: | 176 | except xapian.DatabaseError: |
| 171 | logging.info("Could not open popcon index.") | 177 | logging.info("Could not open popcon index.") |
| 172 | return 0 | 178 | return 0 |
| @@ -222,35 +228,23 @@ class PopconXapianIndex(xapian.WritableDatabase): | @@ -222,35 +228,23 @@ class PopconXapianIndex(xapian.WritableDatabase): | ||
| 222 | submissions.append(submission) | 228 | submissions.append(submission) |
| 223 | return submissions | 229 | return submissions |
| 224 | 230 | ||
| 225 | - def hierarchical_clustering(self,data,clusters_dir,distance,k=10): | ||
| 226 | - """ | ||
| 227 | - Select popcon submissions from popcon_dir and place them at clusters_dir | ||
| 228 | - """ | ||
| 229 | - cl = cluster.HierarchicalClustering(data, lambda x,y: | ||
| 230 | - distance(x.packages.keys(), | ||
| 231 | - y.packages.keys())) | ||
| 232 | - clusters = cl.getlevel(0.5) | ||
| 233 | - for c in clusters: | ||
| 234 | - print "cluster" | ||
| 235 | - for submission in c: | ||
| 236 | - print submission.user_id | ||
| 237 | - | ||
| 238 | - def kmedoids_clustering(self,data,clusters_dir,distance,k=10): | 231 | + def kmedoids_clustering(self,data,clusters_dir,distance,k_medoids): |
| 239 | clusters = KMedoidsClustering(data,lambda x,y: | 232 | clusters = KMedoidsClustering(data,lambda x,y: |
| 240 | distance(x.packages.keys(), | 233 | distance(x.packages.keys(), |
| 241 | y.packages.keys())) | 234 | y.packages.keys())) |
| 242 | - medoids = clusters.getMedoids(2) | 235 | + medoids,dispersion = clusters.getMedoids(k_medoids) |
| 243 | for submission in medoids: | 236 | for submission in medoids: |
| 244 | shutil.copyfile(submission.path,os.path.join(clusters_dir, | 237 | shutil.copyfile(submission.path,os.path.join(clusters_dir, |
| 245 | submission.user_id)) | 238 | submission.user_id)) |
| 239 | + return dispersion | ||
| 246 | 240 | ||
| 247 | class KMedoidsClustering(cluster.KMeansClustering): | 241 | class KMedoidsClustering(cluster.KMeansClustering): |
| 248 | 242 | ||
| 249 | - def __init__(self,data,distance): | ||
| 250 | - if len(data)<100: | 243 | + def __init__(self,data,distance,max_data=100): |
| 244 | + if len(data)<max_data: | ||
| 251 | data_sample = data | 245 | data_sample = data |
| 252 | else: | 246 | else: |
| 253 | - data_sample = random.sample(data,100) | 247 | + data_sample = random.sample(data,max_data) |
| 254 | cluster.KMeansClustering.__init__(self, data_sample, distance) | 248 | cluster.KMeansClustering.__init__(self, data_sample, distance) |
| 255 | self.distanceMatrix = {} | 249 | self.distanceMatrix = {} |
| 256 | for submission in self._KMeansClustering__data: | 250 | for submission in self._KMeansClustering__data: |
| @@ -285,7 +279,7 @@ class KMedoidsClustering(cluster.KMeansClustering): | @@ -285,7 +279,7 @@ class KMedoidsClustering(cluster.KMeansClustering): | ||
| 285 | logging.debug("medoidDistance: %f" % medoidDistance) | 279 | logging.debug("medoidDistance: %f" % medoidDistance) |
| 286 | logging.debug("Cluster medoid: [%d] %s" % (medoid, | 280 | logging.debug("Cluster medoid: [%d] %s" % (medoid, |
| 287 | cluster[medoid].user_id)) | 281 | cluster[medoid].user_id)) |
| 288 | - return cluster[medoid] | 282 | + return (cluster[medoid],medoidDistance) |
| 289 | 283 | ||
| 290 | def assign_item(self, item, origin): | 284 | def assign_item(self, item, origin): |
| 291 | """ | 285 | """ |
| @@ -293,7 +287,8 @@ class KMedoidsClustering(cluster.KMeansClustering): | @@ -293,7 +287,8 @@ class KMedoidsClustering(cluster.KMeansClustering): | ||
| 293 | """ | 287 | """ |
| 294 | closest_cluster = origin | 288 | closest_cluster = origin |
| 295 | for cluster in self._KMeansClustering__clusters: | 289 | for cluster in self._KMeansClustering__clusters: |
| 296 | - if self.distance(item,self.getMedoid(cluster)) < self.distance(item,self.getMedoid(closest_cluster)): | 290 | + if self.distance(item,self.getMedoid(cluster)[0]) < \ |
| 291 | + self.distance(item,self.getMedoid(closest_cluster)[0]): | ||
| 297 | closest_cluster = cluster | 292 | closest_cluster = cluster |
| 298 | 293 | ||
| 299 | if closest_cluster != origin: | 294 | if closest_cluster != origin: |
| @@ -307,6 +302,8 @@ class KMedoidsClustering(cluster.KMeansClustering): | @@ -307,6 +302,8 @@ class KMedoidsClustering(cluster.KMeansClustering): | ||
| 307 | """ | 302 | """ |
| 308 | Generate n clusters and return their medoids. | 303 | Generate n clusters and return their medoids. |
| 309 | """ | 304 | """ |
| 310 | - medoids = [self.getMedoid(cluster) for cluster in self.getclusters(n)] | ||
| 311 | - logging.info("Clustering completed and the following centroids were found: %s" % [c.user_id for c in medoids]) | ||
| 312 | - return medoids | 305 | + medoids_distances = [self.getMedoid(cluster) for cluster in self.getclusters(n)] |
| 306 | + medoids = [m[0] for m in medoids_distances] | ||
| 307 | + dispersion = sum([m[1] for m in medoids_distances]) | ||
| 308 | + logging.info("Clustering completed and the following medoids were found: %s" % [c.user_id for c in medoids]) | ||
| 309 | + return medoids,dispersion |