From c9e910a1211092d35b5ce500bb1b2b65a3ff8866 Mon Sep 17 00:00:00 2001 From: Tássia Camões Araújo Date: Wed, 27 Jul 2011 19:10:44 +0000 Subject: [PATCH] Added max_popcon option and fixed bug with getting integer values from config. --- src/config.py | 22 ++++++++++++++-------- src/data.py | 35 ++++++++++++++++++++++------------- 2 files changed, 36 insertions(+), 21 deletions(-) diff --git a/src/config.py b/src/config.py index a3bf63c..31b01d5 100644 --- a/src/config.py +++ b/src/config.py @@ -46,6 +46,7 @@ class Config(): self.popcon_dir = os.path.expanduser("~/.app-recommender/popcon_dir") self.clusters_dir = os.path.expanduser("~/.app-recommender/clusters_dir") self.k_medoids = 100 + self.max_popcon = 1000 self.index_mode = "old" self.strategy = "cb" self.weight = "bm25" @@ -71,6 +72,7 @@ class Config(): print " -u, --indexmode= 'old'|'reindex'|'cluster'|'recluster'" print " -l, --clustersdir=PATH Path to popcon clusters dir" print " -c, --medoids=k Number of medoids for clustering" + print " -x, --maxpopcon=k Number of submissions to be considered" print "" print " [ recommender ]" print " -w, --weight=OPTION Search weighting scheme" @@ -112,8 +114,8 @@ class Config(): logging.error("Error in config file syntax: %s", str(err)) os.abort() - self.debug = self.read_option('general', 'debug') - self.debug = self.read_option('general', 'verbose') + self.debug = int(self.read_option('general', 'debug')) + self.debug = int(self.read_option('general', 'verbose')) self.output_filename = self.read_option('general', 'output') self.survey_mode = self.read_option('general', 'survey_mode') @@ -123,16 +125,18 @@ class Config(): self.popcon_dir = os.path.expanduser(self.read_option('data_sources', 'popcon_dir')) self.index_mode = self.read_option('data_sources', 'index_mode') self.clusters_dir = os.path.expanduser(self.read_option('data_sources', 'clusters_dir')) - self.k_medoids = self.read_option('data_sources', 'k_medoids') + self.k_medoids = 
int(self.read_option('data_sources', 'k_medoids')) + self.max_popcon = int(self.read_option('data_sources', 'max_popcon')) self.weight = self.read_option('recommender', 'weight') self.strategy = self.read_option('recommender', 'strategy') - self.profile_size = self.read_option('recommender', 'profile_size') + self.profile_size = int(self.read_option('recommender', + 'profile_size')) - short_options = "hdvo:a:e:p:m:ul:c:w:s:z:" + short_options = "hdvo:a:e:p:m:ul:c:x:w:s:z:" long_options = ["help", "debug", "verbose", "output=", "axi=", "dde=", "popconindex=", "popcondir=", "indexmode=", - "clustersdir=", "kmedoids=", "weight=", "strategy=", + "clustersdir=", "kmedoids=", "max_popcon=", "weight=", "strategy=", "profile_size="] try: opts, args = getopt.getopt(sys.argv[1:], short_options, @@ -166,13 +170,15 @@ class Config(): elif o in ("-l", "--clustersdir"): self.clusters_dir = p elif o in ("-c", "--kmedoids"): - self.k_medoids = p + self.k_medoids = int(p) + elif o in ("-x", "--max_popcon"): + self.max_popcon = int(p) elif o in ("-w", "--weight"): self.weight = p elif o in ("-s", "--strategy"): self.strategy = p elif o in ("-z", "--profile_size"): - self.strategy = p + self.strategy = int(p) else: assert False, "unhandled option" diff --git a/src/data.py b/src/data.py index c883d65..c9b88c0 100644 --- a/src/data.py +++ b/src/data.py @@ -82,7 +82,7 @@ class AppAptXapianIndex(xapian.WritableDatabase): except: logging.info("Doc %d not found in axi." % docid) logging.info("AppAptXapianIndex size: %d (lastdocid: %d)." 
% - self.get_doccount(), self.get_lastdocid()) + (self.get_doccount(), self.get_lastdocid())) def __str__(self): return print_index(self) @@ -166,6 +166,7 @@ class PopconXapianIndex(xapian.WritableDatabase): raise Error if cfg.index_mode == "reindex": self.source_dir = os.path.expanduser(cfg.popcon_dir) + logging.debug(self.source_dir) else: self.source_dir = os.path.expanduser(cfg.clusters_dir) if not os.path.exists(cfg.clusters_dir): @@ -180,10 +181,12 @@ class PopconXapianIndex(xapian.WritableDatabase): % cfg.clusters_dir) distance = JaccardDistance() data = self.get_submissions(cfg.popcon_dir) + logging.debug(type(data)) self.cluster_dispersion = \ self.kmedoids_clustering(data, cfg.clusters_dir, - distance, cfg.k_medoids) - logging.info("Clusters dispersion: %f.2", + distance, cfg.k_medoids, + cfg.max_popcon) + logging.info("Clusters dispersion: %.2f", self.cluster_dispersion) else: logging.info("Using clusters from \'%s\'" % @@ -221,8 +224,9 @@ class PopconXapianIndex(xapian.WritableDatabase): self.path) xapian.WritableDatabase.__init__(self,self.path, xapian.DB_CREATE_OR_OVERWRITE) - except xapian.DatabaseError: + except xapian.DatabaseError as e: logging.critical("Could not create popcon xapian index.") + logging.critical(str(e)) raise Error for root, dirs, files in os.walk(self.source_dir): @@ -254,29 +258,32 @@ class PopconXapianIndex(xapian.WritableDatabase): submissions = [] for root, dirs, files in os.walk(submissions_dir): for popcon_file in files: + logging.debug("Parsing submission %s" % popcon_file) submission = PopconSubmission(os.path.join(root, popcon_file)) submissions.append(submission) return submissions - def kmedoids_clustering(self,data,clusters_dir,distance,k_medoids): + def kmedoids_clustering(self,data,clusters_dir,distance,k_medoids,max_popcon): clusters = KMedoidsClustering(data,lambda x,y: distance(x.packages.keys(), - y.packages.keys())) + y.packages.keys()),max_popcon) medoids,dispersion = clusters.getMedoids(k_medoids) for 
submission in medoids: + logging.debug("Copying submission %s" % submission.user_id) shutil.copyfile(submission.path,os.path.join(clusters_dir, submission.user_id)) return dispersion class KMedoidsClustering(cluster.KMeansClustering): - def __init__(self,data,distance,max_data=100): - # if len(data)