Commit b48d6eca8ef182ab26f13804f0ac48deed722f70

Authored by Tássia Camões Araújo
1 parent 5011c245
Exists in master and in 1 other branch add_vagrant

New index_mode scheme allows the following options: 'old', 'reindex', 'cluster' and 'recluster'.

Refactored k_medoids_clustering.
Showing 1 changed file with 30 additions and 33 deletions   Show diff stats
@@ -129,31 +129,36 @@ class PopconXapianIndex(xapian.WritableDatabase): @@ -129,31 +129,36 @@ class PopconXapianIndex(xapian.WritableDatabase):
129 """ 129 """
130 self.axi = xapian.Database(cfg.axi) 130 self.axi = xapian.Database(cfg.axi)
131 self.path = os.path.expanduser(cfg.popcon_index) 131 self.path = os.path.expanduser(cfg.popcon_index)
132 - if cfg.index_mode.startswith("1") or not self.load_index(): 132 + if not cfg.index_mode == "old" or not self.load_index():
133 if not os.path.exists(cfg.popcon_dir): 133 if not os.path.exists(cfg.popcon_dir):
134 os.makedirs(cfg.popcon_dir) 134 os.makedirs(cfg.popcon_dir)
135 if not os.listdir(cfg.popcon_dir): 135 if not os.listdir(cfg.popcon_dir):
136 logging.critical("Popcon dir seems to be empty.") 136 logging.critical("Popcon dir seems to be empty.")
137 raise Error 137 raise Error
138 - if cfg.index_mode == "10": 138 + if cfg.index_mode == "reindex":
139 self.source_dir = os.path.expanduser(cfg.popcon_dir) 139 self.source_dir = os.path.expanduser(cfg.popcon_dir)
140 else: 140 else:
141 self.source_dir = os.path.expanduser(cfg.clusters_dir) 141 self.source_dir = os.path.expanduser(cfg.clusters_dir)
142 if not os.path.exists(cfg.clusters_dir): 142 if not os.path.exists(cfg.clusters_dir):
143 os.makedirs(cfg.clusters_dir) 143 os.makedirs(cfg.clusters_dir)
144 - if not os.listdir(cfg.clusters_dir):  
145 - distance = JaccardDistance() 144 + if not os.listdir(cfg.clusters_dir) or \
  145 + cfg.index_mode == "recluster":
  146 + shutil.rmtree(cfg.clusters_dir,1)
  147 + os.makedirs(cfg.clusters_dir)
146 logging.info("Clustering popcon submissions from \'%s\'" 148 logging.info("Clustering popcon submissions from \'%s\'"
147 % cfg.popcon_dir) 149 % cfg.popcon_dir)
148 logging.info("Clusters will be placed at \'%s\'" 150 logging.info("Clusters will be placed at \'%s\'"
149 % cfg.clusters_dir) 151 % cfg.clusters_dir)
  152 + distance = JaccardDistance()
150 data = self.get_submissions(cfg.popcon_dir) 153 data = self.get_submissions(cfg.popcon_dir)
151 - if cfg.clustering == "Hierarchical":  
152 - self.hierarchical_clustering(data,cfg.clusters_dir,  
153 - distance)  
154 - else:  
155 - self.kmedoids_clustering(data,cfg.clusters_dir,  
156 - distance) 154 + self.cluster_dispersion = \
  155 + self.kmedoids_clustering(data, cfg.clusters_dir,
  156 + distance, cfg.k_medoids)
  157 + logging.info("Clusters dispersion: %f.2",
  158 + self.cluster_dispersion)
  159 + else:
  160 + logging.info("Using clusters from \'%s\'" %
  161 + cfg.clusters_dir)
157 self.build_index() 162 self.build_index()
158 163
159 def __str__(self): 164 def __str__(self):
@@ -167,6 +172,7 @@ class PopconXapianIndex(xapian.WritableDatabase): @@ -167,6 +172,7 @@ class PopconXapianIndex(xapian.WritableDatabase):
167 logging.info("Opening existing popcon xapian index at \'%s\'" 172 logging.info("Opening existing popcon xapian index at \'%s\'"
168 % self.path) 173 % self.path)
169 xapian.Database.__init__(self,self.path) 174 xapian.Database.__init__(self,self.path)
  175 + return 1
170 except xapian.DatabaseError: 176 except xapian.DatabaseError:
171 logging.info("Could not open popcon index.") 177 logging.info("Could not open popcon index.")
172 return 0 178 return 0
@@ -222,35 +228,23 @@ class PopconXapianIndex(xapian.WritableDatabase): @@ -222,35 +228,23 @@ class PopconXapianIndex(xapian.WritableDatabase):
222 submissions.append(submission) 228 submissions.append(submission)
223 return submissions 229 return submissions
224 230
225 - def hierarchical_clustering(self,data,clusters_dir,distance,k=10):  
226 - """  
227 - Select popcon submissions from popcon_dir and place them at clusters_dir  
228 - """  
229 - cl = cluster.HierarchicalClustering(data, lambda x,y:  
230 - distance(x.packages.keys(),  
231 - y.packages.keys()))  
232 - clusters = cl.getlevel(0.5)  
233 - for c in clusters:  
234 - print "cluster"  
235 - for submission in c:  
236 - print submission.user_id  
237 -  
238 - def kmedoids_clustering(self,data,clusters_dir,distance,k=10): 231 + def kmedoids_clustering(self,data,clusters_dir,distance,k_medoids):
239 clusters = KMedoidsClustering(data,lambda x,y: 232 clusters = KMedoidsClustering(data,lambda x,y:
240 distance(x.packages.keys(), 233 distance(x.packages.keys(),
241 y.packages.keys())) 234 y.packages.keys()))
242 - medoids = clusters.getMedoids(2) 235 + medoids,dispersion = clusters.getMedoids(k_medoids)
243 for submission in medoids: 236 for submission in medoids:
244 shutil.copyfile(submission.path,os.path.join(clusters_dir, 237 shutil.copyfile(submission.path,os.path.join(clusters_dir,
245 submission.user_id)) 238 submission.user_id))
  239 + return dispersion
246 240
247 class KMedoidsClustering(cluster.KMeansClustering): 241 class KMedoidsClustering(cluster.KMeansClustering):
248 242
249 - def __init__(self,data,distance):  
250 - if len(data)<100: 243 + def __init__(self,data,distance,max_data=100):
  244 + if len(data)<max_data:
251 data_sample = data 245 data_sample = data
252 else: 246 else:
253 - data_sample = random.sample(data,100) 247 + data_sample = random.sample(data,max_data)
254 cluster.KMeansClustering.__init__(self, data_sample, distance) 248 cluster.KMeansClustering.__init__(self, data_sample, distance)
255 self.distanceMatrix = {} 249 self.distanceMatrix = {}
256 for submission in self._KMeansClustering__data: 250 for submission in self._KMeansClustering__data:
@@ -285,7 +279,7 @@ class KMedoidsClustering(cluster.KMeansClustering): @@ -285,7 +279,7 @@ class KMedoidsClustering(cluster.KMeansClustering):
285 logging.debug("medoidDistance: %f" % medoidDistance) 279 logging.debug("medoidDistance: %f" % medoidDistance)
286 logging.debug("Cluster medoid: [%d] %s" % (medoid, 280 logging.debug("Cluster medoid: [%d] %s" % (medoid,
287 cluster[medoid].user_id)) 281 cluster[medoid].user_id))
288 - return cluster[medoid] 282 + return (cluster[medoid],medoidDistance)
289 283
290 def assign_item(self, item, origin): 284 def assign_item(self, item, origin):
291 """ 285 """
@@ -293,7 +287,8 @@ class KMedoidsClustering(cluster.KMeansClustering): @@ -293,7 +287,8 @@ class KMedoidsClustering(cluster.KMeansClustering):
293 """ 287 """
294 closest_cluster = origin 288 closest_cluster = origin
295 for cluster in self._KMeansClustering__clusters: 289 for cluster in self._KMeansClustering__clusters:
296 - if self.distance(item,self.getMedoid(cluster)) < self.distance(item,self.getMedoid(closest_cluster)): 290 + if self.distance(item,self.getMedoid(cluster)[0]) < \
  291 + self.distance(item,self.getMedoid(closest_cluster)[0]):
297 closest_cluster = cluster 292 closest_cluster = cluster
298 293
299 if closest_cluster != origin: 294 if closest_cluster != origin:
@@ -307,6 +302,8 @@ class KMedoidsClustering(cluster.KMeansClustering): @@ -307,6 +302,8 @@ class KMedoidsClustering(cluster.KMeansClustering):
307 """ 302 """
308 Generate n clusters and return their medoids. 303 Generate n clusters and return their medoids.
309 """ 304 """
310 - medoids = [self.getMedoid(cluster) for cluster in self.getclusters(n)]  
311 - logging.info("Clustering completed and the following centroids were found: %s" % [c.user_id for c in medoids])  
312 - return medoids 305 + medoids_distances = [self.getMedoid(cluster) for cluster in self.getclusters(n)]
  306 + medoids = [m[0] for m in medoids_distances]
  307 + dispersion = sum([m[1] for m in medoids_distances])
  308 + logging.info("Clustering completed and the following medoids were found: %s" % [c.user_id for c in medoids])
  309 + return medoids,dispersion