Commit b48d6eca8ef182ab26f13804f0ac48deed722f70
1 parent
5011c245
Exists in
master
and in
1 other branch
New index_mode scheme allows the following options: 'old', 'reindex', 'cluster' and 'recluster'.
Refactored k_medoids_clustering.
Showing
1 changed file
with
30 additions
and
33 deletions
Show diff stats
src/data.py
@@ -129,31 +129,36 @@ class PopconXapianIndex(xapian.WritableDatabase): | @@ -129,31 +129,36 @@ class PopconXapianIndex(xapian.WritableDatabase): | ||
129 | """ | 129 | """ |
130 | self.axi = xapian.Database(cfg.axi) | 130 | self.axi = xapian.Database(cfg.axi) |
131 | self.path = os.path.expanduser(cfg.popcon_index) | 131 | self.path = os.path.expanduser(cfg.popcon_index) |
132 | - if cfg.index_mode.startswith("1") or not self.load_index(): | 132 | + if not cfg.index_mode == "old" or not self.load_index(): |
133 | if not os.path.exists(cfg.popcon_dir): | 133 | if not os.path.exists(cfg.popcon_dir): |
134 | os.makedirs(cfg.popcon_dir) | 134 | os.makedirs(cfg.popcon_dir) |
135 | if not os.listdir(cfg.popcon_dir): | 135 | if not os.listdir(cfg.popcon_dir): |
136 | logging.critical("Popcon dir seems to be empty.") | 136 | logging.critical("Popcon dir seems to be empty.") |
137 | raise Error | 137 | raise Error |
138 | - if cfg.index_mode == "10": | 138 | + if cfg.index_mode == "reindex": |
139 | self.source_dir = os.path.expanduser(cfg.popcon_dir) | 139 | self.source_dir = os.path.expanduser(cfg.popcon_dir) |
140 | else: | 140 | else: |
141 | self.source_dir = os.path.expanduser(cfg.clusters_dir) | 141 | self.source_dir = os.path.expanduser(cfg.clusters_dir) |
142 | if not os.path.exists(cfg.clusters_dir): | 142 | if not os.path.exists(cfg.clusters_dir): |
143 | os.makedirs(cfg.clusters_dir) | 143 | os.makedirs(cfg.clusters_dir) |
144 | - if not os.listdir(cfg.clusters_dir): | ||
145 | - distance = JaccardDistance() | 144 | + if not os.listdir(cfg.clusters_dir) or \ |
145 | + cfg.index_mode == "recluster": | ||
146 | + shutil.rmtree(cfg.clusters_dir,1) | ||
147 | + os.makedirs(cfg.clusters_dir) | ||
146 | logging.info("Clustering popcon submissions from \'%s\'" | 148 | logging.info("Clustering popcon submissions from \'%s\'" |
147 | % cfg.popcon_dir) | 149 | % cfg.popcon_dir) |
148 | logging.info("Clusters will be placed at \'%s\'" | 150 | logging.info("Clusters will be placed at \'%s\'" |
149 | % cfg.clusters_dir) | 151 | % cfg.clusters_dir) |
152 | + distance = JaccardDistance() | ||
150 | data = self.get_submissions(cfg.popcon_dir) | 153 | data = self.get_submissions(cfg.popcon_dir) |
151 | - if cfg.clustering == "Hierarchical": | ||
152 | - self.hierarchical_clustering(data,cfg.clusters_dir, | ||
153 | - distance) | ||
154 | - else: | ||
155 | - self.kmedoids_clustering(data,cfg.clusters_dir, | ||
156 | - distance) | 154 | + self.cluster_dispersion = \ |
155 | + self.kmedoids_clustering(data, cfg.clusters_dir, | ||
156 | + distance, cfg.k_medoids) | ||
157 | + logging.info("Clusters dispersion: %f.2", | ||
158 | + self.cluster_dispersion) | ||
159 | + else: | ||
160 | + logging.info("Using clusters from \'%s\'" % | ||
161 | + cfg.clusters_dir) | ||
157 | self.build_index() | 162 | self.build_index() |
158 | 163 | ||
159 | def __str__(self): | 164 | def __str__(self): |
@@ -167,6 +172,7 @@ class PopconXapianIndex(xapian.WritableDatabase): | @@ -167,6 +172,7 @@ class PopconXapianIndex(xapian.WritableDatabase): | ||
167 | logging.info("Opening existing popcon xapian index at \'%s\'" | 172 | logging.info("Opening existing popcon xapian index at \'%s\'" |
168 | % self.path) | 173 | % self.path) |
169 | xapian.Database.__init__(self,self.path) | 174 | xapian.Database.__init__(self,self.path) |
175 | + return 1 | ||
170 | except xapian.DatabaseError: | 176 | except xapian.DatabaseError: |
171 | logging.info("Could not open popcon index.") | 177 | logging.info("Could not open popcon index.") |
172 | return 0 | 178 | return 0 |
@@ -222,35 +228,23 @@ class PopconXapianIndex(xapian.WritableDatabase): | @@ -222,35 +228,23 @@ class PopconXapianIndex(xapian.WritableDatabase): | ||
222 | submissions.append(submission) | 228 | submissions.append(submission) |
223 | return submissions | 229 | return submissions |
224 | 230 | ||
225 | - def hierarchical_clustering(self,data,clusters_dir,distance,k=10): | ||
226 | - """ | ||
227 | - Select popcon submissions from popcon_dir and place them at clusters_dir | ||
228 | - """ | ||
229 | - cl = cluster.HierarchicalClustering(data, lambda x,y: | ||
230 | - distance(x.packages.keys(), | ||
231 | - y.packages.keys())) | ||
232 | - clusters = cl.getlevel(0.5) | ||
233 | - for c in clusters: | ||
234 | - print "cluster" | ||
235 | - for submission in c: | ||
236 | - print submission.user_id | ||
237 | - | ||
238 | - def kmedoids_clustering(self,data,clusters_dir,distance,k=10): | 231 | + def kmedoids_clustering(self,data,clusters_dir,distance,k_medoids): |
239 | clusters = KMedoidsClustering(data,lambda x,y: | 232 | clusters = KMedoidsClustering(data,lambda x,y: |
240 | distance(x.packages.keys(), | 233 | distance(x.packages.keys(), |
241 | y.packages.keys())) | 234 | y.packages.keys())) |
242 | - medoids = clusters.getMedoids(2) | 235 | + medoids,dispersion = clusters.getMedoids(k_medoids) |
243 | for submission in medoids: | 236 | for submission in medoids: |
244 | shutil.copyfile(submission.path,os.path.join(clusters_dir, | 237 | shutil.copyfile(submission.path,os.path.join(clusters_dir, |
245 | submission.user_id)) | 238 | submission.user_id)) |
239 | + return dispersion | ||
246 | 240 | ||
247 | class KMedoidsClustering(cluster.KMeansClustering): | 241 | class KMedoidsClustering(cluster.KMeansClustering): |
248 | 242 | ||
249 | - def __init__(self,data,distance): | ||
250 | - if len(data)<100: | 243 | + def __init__(self,data,distance,max_data=100): |
244 | + if len(data)<max_data: | ||
251 | data_sample = data | 245 | data_sample = data |
252 | else: | 246 | else: |
253 | - data_sample = random.sample(data,100) | 247 | + data_sample = random.sample(data,max_data) |
254 | cluster.KMeansClustering.__init__(self, data_sample, distance) | 248 | cluster.KMeansClustering.__init__(self, data_sample, distance) |
255 | self.distanceMatrix = {} | 249 | self.distanceMatrix = {} |
256 | for submission in self._KMeansClustering__data: | 250 | for submission in self._KMeansClustering__data: |
@@ -285,7 +279,7 @@ class KMedoidsClustering(cluster.KMeansClustering): | @@ -285,7 +279,7 @@ class KMedoidsClustering(cluster.KMeansClustering): | ||
285 | logging.debug("medoidDistance: %f" % medoidDistance) | 279 | logging.debug("medoidDistance: %f" % medoidDistance) |
286 | logging.debug("Cluster medoid: [%d] %s" % (medoid, | 280 | logging.debug("Cluster medoid: [%d] %s" % (medoid, |
287 | cluster[medoid].user_id)) | 281 | cluster[medoid].user_id)) |
288 | - return cluster[medoid] | 282 | + return (cluster[medoid],medoidDistance) |
289 | 283 | ||
290 | def assign_item(self, item, origin): | 284 | def assign_item(self, item, origin): |
291 | """ | 285 | """ |
@@ -293,7 +287,8 @@ class KMedoidsClustering(cluster.KMeansClustering): | @@ -293,7 +287,8 @@ class KMedoidsClustering(cluster.KMeansClustering): | ||
293 | """ | 287 | """ |
294 | closest_cluster = origin | 288 | closest_cluster = origin |
295 | for cluster in self._KMeansClustering__clusters: | 289 | for cluster in self._KMeansClustering__clusters: |
296 | - if self.distance(item,self.getMedoid(cluster)) < self.distance(item,self.getMedoid(closest_cluster)): | 290 | + if self.distance(item,self.getMedoid(cluster)[0]) < \ |
291 | + self.distance(item,self.getMedoid(closest_cluster)[0]): | ||
297 | closest_cluster = cluster | 292 | closest_cluster = cluster |
298 | 293 | ||
299 | if closest_cluster != origin: | 294 | if closest_cluster != origin: |
@@ -307,6 +302,8 @@ class KMedoidsClustering(cluster.KMeansClustering): | @@ -307,6 +302,8 @@ class KMedoidsClustering(cluster.KMeansClustering): | ||
307 | """ | 302 | """ |
308 | Generate n clusters and return their medoids. | 303 | Generate n clusters and return their medoids. |
309 | """ | 304 | """ |
310 | - medoids = [self.getMedoid(cluster) for cluster in self.getclusters(n)] | ||
311 | - logging.info("Clustering completed and the following centroids were found: %s" % [c.user_id for c in medoids]) | ||
312 | - return medoids | 305 | + medoids_distances = [self.getMedoid(cluster) for cluster in self.getclusters(n)] |
306 | + medoids = [m[0] for m in medoids_distances] | ||
307 | + dispersion = sum([m[1] for m in medoids_distances]) | ||
308 | + logging.info("Clustering completed and the following medoids were found: %s" % [c.user_id for c in medoids]) | ||
309 | + return medoids,dispersion |