From b48d6eca8ef182ab26f13804f0ac48deed722f70 Mon Sep 17 00:00:00 2001 From: Tássia Camões Araújo Date: Fri, 15 Jul 2011 05:14:38 -0300 Subject: [PATCH] New index_mode scheme allow the following options: 'old', 'reindex', 'cluster' and 'recluster'. Refactored k_medoids_clustering. --- src/data.py | 63 ++++++++++++++++++++++++++++++--------------------------------- 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/src/data.py b/src/data.py index 728127b..37c1593 100644 --- a/src/data.py +++ b/src/data.py @@ -129,31 +129,36 @@ class PopconXapianIndex(xapian.WritableDatabase): """ self.axi = xapian.Database(cfg.axi) self.path = os.path.expanduser(cfg.popcon_index) - if cfg.index_mode.startswith("1") or not self.load_index(): + if not cfg.index_mode == "old" or not self.load_index(): if not os.path.exists(cfg.popcon_dir): os.makedirs(cfg.popcon_dir) if not os.listdir(cfg.popcon_dir): logging.critical("Popcon dir seems to be empty.") raise Error - if cfg.index_mode == "10": + if cfg.index_mode == "reindex": self.source_dir = os.path.expanduser(cfg.popcon_dir) else: self.source_dir = os.path.expanduser(cfg.clusters_dir) if not os.path.exists(cfg.clusters_dir): os.makedirs(cfg.clusters_dir) - if not os.listdir(cfg.clusters_dir): - distance = JaccardDistance() + if not os.listdir(cfg.clusters_dir) or \ + cfg.index_mode == "recluster": + shutil.rmtree(cfg.clusters_dir,1) + os.makedirs(cfg.clusters_dir) logging.info("Clustering popcon submissions from \'%s\'" % cfg.popcon_dir) logging.info("Clusters will be placed at \'%s\'" % cfg.clusters_dir) + distance = JaccardDistance() data = self.get_submissions(cfg.popcon_dir) - if cfg.clustering == "Hierarchical": - self.hierarchical_clustering(data,cfg.clusters_dir, - distance) - else: - self.kmedoids_clustering(data,cfg.clusters_dir, - distance) + self.cluster_dispersion = \ + self.kmedoids_clustering(data, cfg.clusters_dir, + distance, cfg.k_medoids) + logging.info("Clusters dispersion: %f.2", + self.cluster_dispersion) + else: + logging.info("Using clusters from \'%s\'" % + cfg.clusters_dir) self.build_index() def __str__(self): @@ -167,6 +172,7 @@ class PopconXapianIndex(xapian.WritableDatabase): logging.info("Opening existing popcon xapian index at \'%s\'" % self.path) xapian.Database.__init__(self,self.path) + return 1 except xapian.DatabaseError: logging.info("Could not open popcon index.") return 0 @@ -222,35 +228,23 @@ class PopconXapianIndex(xapian.WritableDatabase): submissions.append(submission) return submissions - def hierarchical_clustering(self,data,clusters_dir,distance,k=10): - """ - Select popcon submissions from popcon_dir and place them at clusters_dir - """ - cl = cluster.HierarchicalClustering(data, lambda x,y: - distance(x.packages.keys(), - y.packages.keys())) - clusters = cl.getlevel(0.5) - for c in clusters: - print "cluster" - for submission in c: - print submission.user_id - - def kmedoids_clustering(self,data,clusters_dir,distance,k=10): + def kmedoids_clustering(self,data,clusters_dir,distance,k_medoids): clusters = KMedoidsClustering(data,lambda x,y: distance(x.packages.keys(), y.packages.keys())) - medoids = clusters.getMedoids(2) + medoids,dispersion = clusters.getMedoids(k_medoids) for submission in medoids: shutil.copyfile(submission.path,os.path.join(clusters_dir, submission.user_id)) + return dispersion class KMedoidsClustering(cluster.KMeansClustering): - def __init__(self,data,distance): - if len(data)<100: + def __init__(self,data,distance,max_data=100): + if len(data)