Commit b48d6eca8ef182ab26f13804f0ac48deed722f70
1 parent
5011c245
Exists in
master
and in
1 other branch
New index_mode scheme allows the following options: 'old', 'reindex', 'cluster' and 'recluster'.
Refactored k_medoids_clustering.
Showing
1 changed file
with
30 additions
and
33 deletions
Show diff stats
src/data.py
... | ... | @@ -129,31 +129,36 @@ class PopconXapianIndex(xapian.WritableDatabase): |
129 | 129 | """ |
130 | 130 | self.axi = xapian.Database(cfg.axi) |
131 | 131 | self.path = os.path.expanduser(cfg.popcon_index) |
132 | - if cfg.index_mode.startswith("1") or not self.load_index(): | |
132 | + if not cfg.index_mode == "old" or not self.load_index(): | |
133 | 133 | if not os.path.exists(cfg.popcon_dir): |
134 | 134 | os.makedirs(cfg.popcon_dir) |
135 | 135 | if not os.listdir(cfg.popcon_dir): |
136 | 136 | logging.critical("Popcon dir seems to be empty.") |
137 | 137 | raise Error |
138 | - if cfg.index_mode == "10": | |
138 | + if cfg.index_mode == "reindex": | |
139 | 139 | self.source_dir = os.path.expanduser(cfg.popcon_dir) |
140 | 140 | else: |
141 | 141 | self.source_dir = os.path.expanduser(cfg.clusters_dir) |
142 | 142 | if not os.path.exists(cfg.clusters_dir): |
143 | 143 | os.makedirs(cfg.clusters_dir) |
144 | - if not os.listdir(cfg.clusters_dir): | |
145 | - distance = JaccardDistance() | |
144 | + if not os.listdir(cfg.clusters_dir) or \ | |
145 | + cfg.index_mode == "recluster": | |
146 | + shutil.rmtree(cfg.clusters_dir,1) | |
147 | + os.makedirs(cfg.clusters_dir) | |
146 | 148 | logging.info("Clustering popcon submissions from \'%s\'" |
147 | 149 | % cfg.popcon_dir) |
148 | 150 | logging.info("Clusters will be placed at \'%s\'" |
149 | 151 | % cfg.clusters_dir) |
152 | + distance = JaccardDistance() | |
150 | 153 | data = self.get_submissions(cfg.popcon_dir) |
151 | - if cfg.clustering == "Hierarchical": | |
152 | - self.hierarchical_clustering(data,cfg.clusters_dir, | |
153 | - distance) | |
154 | - else: | |
155 | - self.kmedoids_clustering(data,cfg.clusters_dir, | |
156 | - distance) | |
154 | + self.cluster_dispersion = \ | |
155 | + self.kmedoids_clustering(data, cfg.clusters_dir, | |
156 | + distance, cfg.k_medoids) | |
157 | + logging.info("Clusters dispersion: %f.2", | |
158 | + self.cluster_dispersion) | |
159 | + else: | |
160 | + logging.info("Using clusters from \'%s\'" % | |
161 | + cfg.clusters_dir) | |
157 | 162 | self.build_index() |
158 | 163 | |
159 | 164 | def __str__(self): |
... | ... | @@ -167,6 +172,7 @@ class PopconXapianIndex(xapian.WritableDatabase): |
167 | 172 | logging.info("Opening existing popcon xapian index at \'%s\'" |
168 | 173 | % self.path) |
169 | 174 | xapian.Database.__init__(self,self.path) |
175 | + return 1 | |
170 | 176 | except xapian.DatabaseError: |
171 | 177 | logging.info("Could not open popcon index.") |
172 | 178 | return 0 |
... | ... | @@ -222,35 +228,23 @@ class PopconXapianIndex(xapian.WritableDatabase): |
222 | 228 | submissions.append(submission) |
223 | 229 | return submissions |
224 | 230 | |
225 | - def hierarchical_clustering(self,data,clusters_dir,distance,k=10): | |
226 | - """ | |
227 | - Select popcon submissions from popcon_dir and place them at clusters_dir | |
228 | - """ | |
229 | - cl = cluster.HierarchicalClustering(data, lambda x,y: | |
230 | - distance(x.packages.keys(), | |
231 | - y.packages.keys())) | |
232 | - clusters = cl.getlevel(0.5) | |
233 | - for c in clusters: | |
234 | - print "cluster" | |
235 | - for submission in c: | |
236 | - print submission.user_id | |
237 | - | |
238 | - def kmedoids_clustering(self,data,clusters_dir,distance,k=10): | |
231 | + def kmedoids_clustering(self,data,clusters_dir,distance,k_medoids): | |
239 | 232 | clusters = KMedoidsClustering(data,lambda x,y: |
240 | 233 | distance(x.packages.keys(), |
241 | 234 | y.packages.keys())) |
242 | - medoids = clusters.getMedoids(2) | |
235 | + medoids,dispersion = clusters.getMedoids(k_medoids) | |
243 | 236 | for submission in medoids: |
244 | 237 | shutil.copyfile(submission.path,os.path.join(clusters_dir, |
245 | 238 | submission.user_id)) |
239 | + return dispersion | |
246 | 240 | |
247 | 241 | class KMedoidsClustering(cluster.KMeansClustering): |
248 | 242 | |
249 | - def __init__(self,data,distance): | |
250 | - if len(data)<100: | |
243 | + def __init__(self,data,distance,max_data=100): | |
244 | + if len(data)<max_data: | |
251 | 245 | data_sample = data |
252 | 246 | else: |
253 | - data_sample = random.sample(data,100) | |
247 | + data_sample = random.sample(data,max_data) | |
254 | 248 | cluster.KMeansClustering.__init__(self, data_sample, distance) |
255 | 249 | self.distanceMatrix = {} |
256 | 250 | for submission in self._KMeansClustering__data: |
... | ... | @@ -285,7 +279,7 @@ class KMedoidsClustering(cluster.KMeansClustering): |
285 | 279 | logging.debug("medoidDistance: %f" % medoidDistance) |
286 | 280 | logging.debug("Cluster medoid: [%d] %s" % (medoid, |
287 | 281 | cluster[medoid].user_id)) |
288 | - return cluster[medoid] | |
282 | + return (cluster[medoid],medoidDistance) | |
289 | 283 | |
290 | 284 | def assign_item(self, item, origin): |
291 | 285 | """ |
... | ... | @@ -293,7 +287,8 @@ class KMedoidsClustering(cluster.KMeansClustering): |
293 | 287 | """ |
294 | 288 | closest_cluster = origin |
295 | 289 | for cluster in self._KMeansClustering__clusters: |
296 | - if self.distance(item,self.getMedoid(cluster)) < self.distance(item,self.getMedoid(closest_cluster)): | |
290 | + if self.distance(item,self.getMedoid(cluster)[0]) < \ | |
291 | + self.distance(item,self.getMedoid(closest_cluster)[0]): | |
297 | 292 | closest_cluster = cluster |
298 | 293 | |
299 | 294 | if closest_cluster != origin: |
... | ... | @@ -307,6 +302,8 @@ class KMedoidsClustering(cluster.KMeansClustering): |
307 | 302 | """ |
308 | 303 | Generate n clusters and return their medoids. |
309 | 304 | """ |
310 | - medoids = [self.getMedoid(cluster) for cluster in self.getclusters(n)] | |
311 | - logging.info("Clustering completed and the following centroids were found: %s" % [c.user_id for c in medoids]) | |
312 | - return medoids | |
305 | + medoids_distances = [self.getMedoid(cluster) for cluster in self.getclusters(n)] | |
306 | + medoids = [m[0] for m in medoids_distances] | |
307 | + dispersion = sum([m[1] for m in medoids_distances]) | |
308 | + logging.info("Clustering completed and the following medoids were found: %s" % [c.user_id for c in medoids]) | |
309 | + return medoids,dispersion | ... | ... |