Commit b48d6eca8ef182ab26f13804f0ac48deed722f70

Authored by Tássia Camões Araújo
1 parent 5011c245
Exists in master and in 1 other branch add_vagrant

New index_mode scheme allows the following options: 'old', 'reindex', 'cluster' and 'recluster'.

Refactored k_medoids_clustering.
Showing 1 changed file with 30 additions and 33 deletions   Show diff stats
src/data.py
... ... @@ -129,31 +129,36 @@ class PopconXapianIndex(xapian.WritableDatabase):
129 129 """
130 130 self.axi = xapian.Database(cfg.axi)
131 131 self.path = os.path.expanduser(cfg.popcon_index)
132   - if cfg.index_mode.startswith("1") or not self.load_index():
  132 + if not cfg.index_mode == "old" or not self.load_index():
133 133 if not os.path.exists(cfg.popcon_dir):
134 134 os.makedirs(cfg.popcon_dir)
135 135 if not os.listdir(cfg.popcon_dir):
136 136 logging.critical("Popcon dir seems to be empty.")
137 137 raise Error
138   - if cfg.index_mode == "10":
  138 + if cfg.index_mode == "reindex":
139 139 self.source_dir = os.path.expanduser(cfg.popcon_dir)
140 140 else:
141 141 self.source_dir = os.path.expanduser(cfg.clusters_dir)
142 142 if not os.path.exists(cfg.clusters_dir):
143 143 os.makedirs(cfg.clusters_dir)
144   - if not os.listdir(cfg.clusters_dir):
145   - distance = JaccardDistance()
  144 + if not os.listdir(cfg.clusters_dir) or \
  145 + cfg.index_mode == "recluster":
  146 + shutil.rmtree(cfg.clusters_dir,1)
  147 + os.makedirs(cfg.clusters_dir)
146 148 logging.info("Clustering popcon submissions from \'%s\'"
147 149 % cfg.popcon_dir)
148 150 logging.info("Clusters will be placed at \'%s\'"
149 151 % cfg.clusters_dir)
  152 + distance = JaccardDistance()
150 153 data = self.get_submissions(cfg.popcon_dir)
151   - if cfg.clustering == "Hierarchical":
152   - self.hierarchical_clustering(data,cfg.clusters_dir,
153   - distance)
154   - else:
155   - self.kmedoids_clustering(data,cfg.clusters_dir,
156   - distance)
  154 + self.cluster_dispersion = \
  155 + self.kmedoids_clustering(data, cfg.clusters_dir,
  156 + distance, cfg.k_medoids)
  157 + logging.info("Clusters dispersion: %f.2",
  158 + self.cluster_dispersion)
  159 + else:
  160 + logging.info("Using clusters from \'%s\'" %
  161 + cfg.clusters_dir)
157 162 self.build_index()
158 163  
159 164 def __str__(self):
... ... @@ -167,6 +172,7 @@ class PopconXapianIndex(xapian.WritableDatabase):
167 172 logging.info("Opening existing popcon xapian index at \'%s\'"
168 173 % self.path)
169 174 xapian.Database.__init__(self,self.path)
  175 + return 1
170 176 except xapian.DatabaseError:
171 177 logging.info("Could not open popcon index.")
172 178 return 0
... ... @@ -222,35 +228,23 @@ class PopconXapianIndex(xapian.WritableDatabase):
222 228 submissions.append(submission)
223 229 return submissions
224 230  
225   - def hierarchical_clustering(self,data,clusters_dir,distance,k=10):
226   - """
227   - Select popcon submissions from popcon_dir and place them at clusters_dir
228   - """
229   - cl = cluster.HierarchicalClustering(data, lambda x,y:
230   - distance(x.packages.keys(),
231   - y.packages.keys()))
232   - clusters = cl.getlevel(0.5)
233   - for c in clusters:
234   - print "cluster"
235   - for submission in c:
236   - print submission.user_id
237   -
238   - def kmedoids_clustering(self,data,clusters_dir,distance,k=10):
  231 + def kmedoids_clustering(self,data,clusters_dir,distance,k_medoids):
239 232 clusters = KMedoidsClustering(data,lambda x,y:
240 233 distance(x.packages.keys(),
241 234 y.packages.keys()))
242   - medoids = clusters.getMedoids(2)
  235 + medoids,dispersion = clusters.getMedoids(k_medoids)
243 236 for submission in medoids:
244 237 shutil.copyfile(submission.path,os.path.join(clusters_dir,
245 238 submission.user_id))
  239 + return dispersion
246 240  
247 241 class KMedoidsClustering(cluster.KMeansClustering):
248 242  
249   - def __init__(self,data,distance):
250   - if len(data)<100:
  243 + def __init__(self,data,distance,max_data=100):
  244 + if len(data)<max_data:
251 245 data_sample = data
252 246 else:
253   - data_sample = random.sample(data,100)
  247 + data_sample = random.sample(data,max_data)
254 248 cluster.KMeansClustering.__init__(self, data_sample, distance)
255 249 self.distanceMatrix = {}
256 250 for submission in self._KMeansClustering__data:
... ... @@ -285,7 +279,7 @@ class KMedoidsClustering(cluster.KMeansClustering):
285 279 logging.debug("medoidDistance: %f" % medoidDistance)
286 280 logging.debug("Cluster medoid: [%d] %s" % (medoid,
287 281 cluster[medoid].user_id))
288   - return cluster[medoid]
  282 + return (cluster[medoid],medoidDistance)
289 283  
290 284 def assign_item(self, item, origin):
291 285 """
... ... @@ -293,7 +287,8 @@ class KMedoidsClustering(cluster.KMeansClustering):
293 287 """
294 288 closest_cluster = origin
295 289 for cluster in self._KMeansClustering__clusters:
296   - if self.distance(item,self.getMedoid(cluster)) < self.distance(item,self.getMedoid(closest_cluster)):
  290 + if self.distance(item,self.getMedoid(cluster)[0]) < \
  291 + self.distance(item,self.getMedoid(closest_cluster)[0]):
297 292 closest_cluster = cluster
298 293  
299 294 if closest_cluster != origin:
... ... @@ -307,6 +302,8 @@ class KMedoidsClustering(cluster.KMeansClustering):
307 302 """
308 303 Generate n clusters and return their medoids.
309 304 """
310   - medoids = [self.getMedoid(cluster) for cluster in self.getclusters(n)]
311   - logging.info("Clustering completed and the following centroids were found: %s" % [c.user_id for c in medoids])
312   - return medoids
  305 + medoids_distances = [self.getMedoid(cluster) for cluster in self.getclusters(n)]
  306 + medoids = [m[0] for m in medoids_distances]
  307 + dispersion = sum([m[1] for m in medoids_distances])
  308 + logging.info("Clustering completed and the following medoids were found: %s" % [c.user_id for c in medoids])
  309 + return medoids,dispersion
... ...