Commit b48d6eca8ef182ab26f13804f0ac48deed722f70
1 parent
5011c245
Exists in
master
and in
1 other branch
New index_mode scheme allows the following options: 'old', 'reindex', 'cluster' and 'recluster'. Refactored k_medoids_clustering.
Showing
1 changed file
with
30 additions
and
33 deletions
Show diff stats
src/data.py
| ... | ... | @@ -129,31 +129,36 @@ class PopconXapianIndex(xapian.WritableDatabase): |
| 129 | 129 | """ |
| 130 | 130 | self.axi = xapian.Database(cfg.axi) |
| 131 | 131 | self.path = os.path.expanduser(cfg.popcon_index) |
| 132 | - if cfg.index_mode.startswith("1") or not self.load_index(): | |
| 132 | + if not cfg.index_mode == "old" or not self.load_index(): | |
| 133 | 133 | if not os.path.exists(cfg.popcon_dir): |
| 134 | 134 | os.makedirs(cfg.popcon_dir) |
| 135 | 135 | if not os.listdir(cfg.popcon_dir): |
| 136 | 136 | logging.critical("Popcon dir seems to be empty.") |
| 137 | 137 | raise Error |
| 138 | - if cfg.index_mode == "10": | |
| 138 | + if cfg.index_mode == "reindex": | |
| 139 | 139 | self.source_dir = os.path.expanduser(cfg.popcon_dir) |
| 140 | 140 | else: |
| 141 | 141 | self.source_dir = os.path.expanduser(cfg.clusters_dir) |
| 142 | 142 | if not os.path.exists(cfg.clusters_dir): |
| 143 | 143 | os.makedirs(cfg.clusters_dir) |
| 144 | - if not os.listdir(cfg.clusters_dir): | |
| 145 | - distance = JaccardDistance() | |
| 144 | + if not os.listdir(cfg.clusters_dir) or \ | |
| 145 | + cfg.index_mode == "recluster": | |
| 146 | + shutil.rmtree(cfg.clusters_dir,1) | |
| 147 | + os.makedirs(cfg.clusters_dir) | |
| 146 | 148 | logging.info("Clustering popcon submissions from \'%s\'" |
| 147 | 149 | % cfg.popcon_dir) |
| 148 | 150 | logging.info("Clusters will be placed at \'%s\'" |
| 149 | 151 | % cfg.clusters_dir) |
| 152 | + distance = JaccardDistance() | |
| 150 | 153 | data = self.get_submissions(cfg.popcon_dir) |
| 151 | - if cfg.clustering == "Hierarchical": | |
| 152 | - self.hierarchical_clustering(data,cfg.clusters_dir, | |
| 153 | - distance) | |
| 154 | - else: | |
| 155 | - self.kmedoids_clustering(data,cfg.clusters_dir, | |
| 156 | - distance) | |
| 154 | + self.cluster_dispersion = \ | |
| 155 | + self.kmedoids_clustering(data, cfg.clusters_dir, | |
| 156 | + distance, cfg.k_medoids) | |
| 157 | + logging.info("Clusters dispersion: %f.2", | |
| 158 | + self.cluster_dispersion) | |
| 159 | + else: | |
| 160 | + logging.info("Using clusters from \'%s\'" % | |
| 161 | + cfg.clusters_dir) | |
| 157 | 162 | self.build_index() |
| 158 | 163 | |
| 159 | 164 | def __str__(self): |
| ... | ... | @@ -167,6 +172,7 @@ class PopconXapianIndex(xapian.WritableDatabase): |
| 167 | 172 | logging.info("Opening existing popcon xapian index at \'%s\'" |
| 168 | 173 | % self.path) |
| 169 | 174 | xapian.Database.__init__(self,self.path) |
| 175 | + return 1 | |
| 170 | 176 | except xapian.DatabaseError: |
| 171 | 177 | logging.info("Could not open popcon index.") |
| 172 | 178 | return 0 |
| ... | ... | @@ -222,35 +228,23 @@ class PopconXapianIndex(xapian.WritableDatabase): |
| 222 | 228 | submissions.append(submission) |
| 223 | 229 | return submissions |
| 224 | 230 | |
| 225 | - def hierarchical_clustering(self,data,clusters_dir,distance,k=10): | |
| 226 | - """ | |
| 227 | - Select popcon submissions from popcon_dir and place them at clusters_dir | |
| 228 | - """ | |
| 229 | - cl = cluster.HierarchicalClustering(data, lambda x,y: | |
| 230 | - distance(x.packages.keys(), | |
| 231 | - y.packages.keys())) | |
| 232 | - clusters = cl.getlevel(0.5) | |
| 233 | - for c in clusters: | |
| 234 | - print "cluster" | |
| 235 | - for submission in c: | |
| 236 | - print submission.user_id | |
| 237 | - | |
| 238 | - def kmedoids_clustering(self,data,clusters_dir,distance,k=10): | |
| 231 | + def kmedoids_clustering(self,data,clusters_dir,distance,k_medoids): | |
| 239 | 232 | clusters = KMedoidsClustering(data,lambda x,y: |
| 240 | 233 | distance(x.packages.keys(), |
| 241 | 234 | y.packages.keys())) |
| 242 | - medoids = clusters.getMedoids(2) | |
| 235 | + medoids,dispersion = clusters.getMedoids(k_medoids) | |
| 243 | 236 | for submission in medoids: |
| 244 | 237 | shutil.copyfile(submission.path,os.path.join(clusters_dir, |
| 245 | 238 | submission.user_id)) |
| 239 | + return dispersion | |
| 246 | 240 | |
| 247 | 241 | class KMedoidsClustering(cluster.KMeansClustering): |
| 248 | 242 | |
| 249 | - def __init__(self,data,distance): | |
| 250 | - if len(data)<100: | |
| 243 | + def __init__(self,data,distance,max_data=100): | |
| 244 | + if len(data)<max_data: | |
| 251 | 245 | data_sample = data |
| 252 | 246 | else: |
| 253 | - data_sample = random.sample(data,100) | |
| 247 | + data_sample = random.sample(data,max_data) | |
| 254 | 248 | cluster.KMeansClustering.__init__(self, data_sample, distance) |
| 255 | 249 | self.distanceMatrix = {} |
| 256 | 250 | for submission in self._KMeansClustering__data: |
| ... | ... | @@ -285,7 +279,7 @@ class KMedoidsClustering(cluster.KMeansClustering): |
| 285 | 279 | logging.debug("medoidDistance: %f" % medoidDistance) |
| 286 | 280 | logging.debug("Cluster medoid: [%d] %s" % (medoid, |
| 287 | 281 | cluster[medoid].user_id)) |
| 288 | - return cluster[medoid] | |
| 282 | + return (cluster[medoid],medoidDistance) | |
| 289 | 283 | |
| 290 | 284 | def assign_item(self, item, origin): |
| 291 | 285 | """ |
| ... | ... | @@ -293,7 +287,8 @@ class KMedoidsClustering(cluster.KMeansClustering): |
| 293 | 287 | """ |
| 294 | 288 | closest_cluster = origin |
| 295 | 289 | for cluster in self._KMeansClustering__clusters: |
| 296 | - if self.distance(item,self.getMedoid(cluster)) < self.distance(item,self.getMedoid(closest_cluster)): | |
| 290 | + if self.distance(item,self.getMedoid(cluster)[0]) < \ | |
| 291 | + self.distance(item,self.getMedoid(closest_cluster)[0]): | |
| 297 | 292 | closest_cluster = cluster |
| 298 | 293 | |
| 299 | 294 | if closest_cluster != origin: |
| ... | ... | @@ -307,6 +302,8 @@ class KMedoidsClustering(cluster.KMeansClustering): |
| 307 | 302 | """ |
| 308 | 303 | Generate n clusters and return their medoids. |
| 309 | 304 | """ |
| 310 | - medoids = [self.getMedoid(cluster) for cluster in self.getclusters(n)] | |
| 311 | - logging.info("Clustering completed and the following centroids were found: %s" % [c.user_id for c in medoids]) | |
| 312 | - return medoids | |
| 305 | + medoids_distances = [self.getMedoid(cluster) for cluster in self.getclusters(n)] | |
| 306 | + medoids = [m[0] for m in medoids_distances] | |
| 307 | + dispersion = sum([m[1] for m in medoids_distances]) | |
| 308 | + logging.info("Clustering completed and the following medoids were found: %s" % [c.user_id for c in medoids]) | |
| 309 | + return medoids,dispersion | ... | ... |