Commit c9e910a1211092d35b5ce500bb1b2b65a3ff8866
1 parent
e70ddffd
Exists in
master
and in
1 other branch
Added max_popcon option and fixed bug with getting integer values from config.
Showing
2 changed files
with
36 additions
and
21 deletions
Show diff stats
src/config.py
| @@ -46,6 +46,7 @@ class Config(): | @@ -46,6 +46,7 @@ class Config(): | ||
| 46 | self.popcon_dir = os.path.expanduser("~/.app-recommender/popcon_dir") | 46 | self.popcon_dir = os.path.expanduser("~/.app-recommender/popcon_dir") |
| 47 | self.clusters_dir = os.path.expanduser("~/.app-recommender/clusters_dir") | 47 | self.clusters_dir = os.path.expanduser("~/.app-recommender/clusters_dir") |
| 48 | self.k_medoids = 100 | 48 | self.k_medoids = 100 |
| 49 | + self.max_popcon = 1000 | ||
| 49 | self.index_mode = "old" | 50 | self.index_mode = "old" |
| 50 | self.strategy = "cb" | 51 | self.strategy = "cb" |
| 51 | self.weight = "bm25" | 52 | self.weight = "bm25" |
| @@ -71,6 +72,7 @@ class Config(): | @@ -71,6 +72,7 @@ class Config(): | ||
| 71 | print " -u, --indexmode= 'old'|'reindex'|'cluster'|'recluster'" | 72 | print " -u, --indexmode= 'old'|'reindex'|'cluster'|'recluster'" |
| 72 | print " -l, --clustersdir=PATH Path to popcon clusters dir" | 73 | print " -l, --clustersdir=PATH Path to popcon clusters dir" |
| 73 | print " -c, --medoids=k Number of medoids for clustering" | 74 | print " -c, --medoids=k Number of medoids for clustering" |
| 75 | + print " -x, --maxpopcon=k Number of submissions to be considered" | ||
| 74 | print "" | 76 | print "" |
| 75 | print " [ recommender ]" | 77 | print " [ recommender ]" |
| 76 | print " -w, --weight=OPTION Search weighting scheme" | 78 | print " -w, --weight=OPTION Search weighting scheme" |
| @@ -112,8 +114,8 @@ class Config(): | @@ -112,8 +114,8 @@ class Config(): | ||
| 112 | logging.error("Error in config file syntax: %s", str(err)) | 114 | logging.error("Error in config file syntax: %s", str(err)) |
| 113 | os.abort() | 115 | os.abort() |
| 114 | 116 | ||
| 115 | - self.debug = self.read_option('general', 'debug') | ||
| 116 | - self.debug = self.read_option('general', 'verbose') | 117 | + self.debug = int(self.read_option('general', 'debug')) |
| 118 | + self.debug = int(self.read_option('general', 'verbose')) | ||
| 117 | self.output_filename = self.read_option('general', 'output') | 119 | self.output_filename = self.read_option('general', 'output') |
| 118 | self.survey_mode = self.read_option('general', 'survey_mode') | 120 | self.survey_mode = self.read_option('general', 'survey_mode') |
| 119 | 121 | ||
| @@ -123,16 +125,18 @@ class Config(): | @@ -123,16 +125,18 @@ class Config(): | ||
| 123 | self.popcon_dir = os.path.expanduser(self.read_option('data_sources', 'popcon_dir')) | 125 | self.popcon_dir = os.path.expanduser(self.read_option('data_sources', 'popcon_dir')) |
| 124 | self.index_mode = self.read_option('data_sources', 'index_mode') | 126 | self.index_mode = self.read_option('data_sources', 'index_mode') |
| 125 | self.clusters_dir = os.path.expanduser(self.read_option('data_sources', 'clusters_dir')) | 127 | self.clusters_dir = os.path.expanduser(self.read_option('data_sources', 'clusters_dir')) |
| 126 | - self.k_medoids = self.read_option('data_sources', 'k_medoids') | 128 | + self.k_medoids = int(self.read_option('data_sources', 'k_medoids')) |
| 129 | + self.max_popcon = int(self.read_option('data_sources', 'max_popcon')) | ||
| 127 | 130 | ||
| 128 | self.weight = self.read_option('recommender', 'weight') | 131 | self.weight = self.read_option('recommender', 'weight') |
| 129 | self.strategy = self.read_option('recommender', 'strategy') | 132 | self.strategy = self.read_option('recommender', 'strategy') |
| 130 | - self.profile_size = self.read_option('recommender', 'profile_size') | 133 | + self.profile_size = int(self.read_option('recommender', |
| 134 | + 'profile_size')) | ||
| 131 | 135 | ||
| 132 | - short_options = "hdvo:a:e:p:m:ul:c:w:s:z:" | 136 | + short_options = "hdvo:a:e:p:m:ul:c:x:w:s:z:" |
| 133 | long_options = ["help", "debug", "verbose", "output=", | 137 | long_options = ["help", "debug", "verbose", "output=", |
| 134 | "axi=", "dde=", "popconindex=", "popcondir=", "indexmode=", | 138 | "axi=", "dde=", "popconindex=", "popcondir=", "indexmode=", |
| 135 | - "clustersdir=", "kmedoids=", "weight=", "strategy=", | 139 | + "clustersdir=", "kmedoids=", "max_popcon=", "weight=", "strategy=", |
| 136 | "profile_size="] | 140 | "profile_size="] |
| 137 | try: | 141 | try: |
| 138 | opts, args = getopt.getopt(sys.argv[1:], short_options, | 142 | opts, args = getopt.getopt(sys.argv[1:], short_options, |
| @@ -166,13 +170,15 @@ class Config(): | @@ -166,13 +170,15 @@ class Config(): | ||
| 166 | elif o in ("-l", "--clustersdir"): | 170 | elif o in ("-l", "--clustersdir"): |
| 167 | self.clusters_dir = p | 171 | self.clusters_dir = p |
| 168 | elif o in ("-c", "--kmedoids"): | 172 | elif o in ("-c", "--kmedoids"): |
| 169 | - self.k_medoids = p | 173 | + self.k_medoids = int(p) |
| 174 | + elif o in ("-x", "--max_popcon"): | ||
| 175 | + self.max_popcon = int(p) | ||
| 170 | elif o in ("-w", "--weight"): | 176 | elif o in ("-w", "--weight"): |
| 171 | self.weight = p | 177 | self.weight = p |
| 172 | elif o in ("-s", "--strategy"): | 178 | elif o in ("-s", "--strategy"): |
| 173 | self.strategy = p | 179 | self.strategy = p |
| 174 | elif o in ("-z", "--profile_size"): | 180 | elif o in ("-z", "--profile_size"): |
| 175 | - self.strategy = p | 181 | + self.strategy = int(p) |
| 176 | else: | 182 | else: |
| 177 | assert False, "unhandled option" | 183 | assert False, "unhandled option" |
| 178 | 184 |
src/data.py
| @@ -82,7 +82,7 @@ class AppAptXapianIndex(xapian.WritableDatabase): | @@ -82,7 +82,7 @@ class AppAptXapianIndex(xapian.WritableDatabase): | ||
| 82 | except: | 82 | except: |
| 83 | logging.info("Doc %d not found in axi." % docid) | 83 | logging.info("Doc %d not found in axi." % docid) |
| 84 | logging.info("AppAptXapianIndex size: %d (lastdocid: %d)." % | 84 | logging.info("AppAptXapianIndex size: %d (lastdocid: %d)." % |
| 85 | - self.get_doccount(), self.get_lastdocid()) | 85 | + (self.get_doccount(), self.get_lastdocid())) |
| 86 | 86 | ||
| 87 | def __str__(self): | 87 | def __str__(self): |
| 88 | return print_index(self) | 88 | return print_index(self) |
| @@ -166,6 +166,7 @@ class PopconXapianIndex(xapian.WritableDatabase): | @@ -166,6 +166,7 @@ class PopconXapianIndex(xapian.WritableDatabase): | ||
| 166 | raise Error | 166 | raise Error |
| 167 | if cfg.index_mode == "reindex": | 167 | if cfg.index_mode == "reindex": |
| 168 | self.source_dir = os.path.expanduser(cfg.popcon_dir) | 168 | self.source_dir = os.path.expanduser(cfg.popcon_dir) |
| 169 | + logging.debug(self.source_dir) | ||
| 169 | else: | 170 | else: |
| 170 | self.source_dir = os.path.expanduser(cfg.clusters_dir) | 171 | self.source_dir = os.path.expanduser(cfg.clusters_dir) |
| 171 | if not os.path.exists(cfg.clusters_dir): | 172 | if not os.path.exists(cfg.clusters_dir): |
| @@ -180,10 +181,12 @@ class PopconXapianIndex(xapian.WritableDatabase): | @@ -180,10 +181,12 @@ class PopconXapianIndex(xapian.WritableDatabase): | ||
| 180 | % cfg.clusters_dir) | 181 | % cfg.clusters_dir) |
| 181 | distance = JaccardDistance() | 182 | distance = JaccardDistance() |
| 182 | data = self.get_submissions(cfg.popcon_dir) | 183 | data = self.get_submissions(cfg.popcon_dir) |
| 184 | + logging.debug(type(data)) | ||
| 183 | self.cluster_dispersion = \ | 185 | self.cluster_dispersion = \ |
| 184 | self.kmedoids_clustering(data, cfg.clusters_dir, | 186 | self.kmedoids_clustering(data, cfg.clusters_dir, |
| 185 | - distance, cfg.k_medoids) | ||
| 186 | - logging.info("Clusters dispersion: %f.2", | 187 | + distance, cfg.k_medoids, |
| 188 | + cfg.max_popcon) | ||
| 189 | + logging.info("Clusters dispersion: %.2f", | ||
| 187 | self.cluster_dispersion) | 190 | self.cluster_dispersion) |
| 188 | else: | 191 | else: |
| 189 | logging.info("Using clusters from \'%s\'" % | 192 | logging.info("Using clusters from \'%s\'" % |
| @@ -221,8 +224,9 @@ class PopconXapianIndex(xapian.WritableDatabase): | @@ -221,8 +224,9 @@ class PopconXapianIndex(xapian.WritableDatabase): | ||
| 221 | self.path) | 224 | self.path) |
| 222 | xapian.WritableDatabase.__init__(self,self.path, | 225 | xapian.WritableDatabase.__init__(self,self.path, |
| 223 | xapian.DB_CREATE_OR_OVERWRITE) | 226 | xapian.DB_CREATE_OR_OVERWRITE) |
| 224 | - except xapian.DatabaseError: | 227 | + except xapian.DatabaseError as e: |
| 225 | logging.critical("Could not create popcon xapian index.") | 228 | logging.critical("Could not create popcon xapian index.") |
| 229 | + logging.critical(str(e)) | ||
| 226 | raise Error | 230 | raise Error |
| 227 | 231 | ||
| 228 | for root, dirs, files in os.walk(self.source_dir): | 232 | for root, dirs, files in os.walk(self.source_dir): |
| @@ -254,29 +258,32 @@ class PopconXapianIndex(xapian.WritableDatabase): | @@ -254,29 +258,32 @@ class PopconXapianIndex(xapian.WritableDatabase): | ||
| 254 | submissions = [] | 258 | submissions = [] |
| 255 | for root, dirs, files in os.walk(submissions_dir): | 259 | for root, dirs, files in os.walk(submissions_dir): |
| 256 | for popcon_file in files: | 260 | for popcon_file in files: |
| 261 | + logging.debug("Parsing submission %s" % popcon_file) | ||
| 257 | submission = PopconSubmission(os.path.join(root, popcon_file)) | 262 | submission = PopconSubmission(os.path.join(root, popcon_file)) |
| 258 | submissions.append(submission) | 263 | submissions.append(submission) |
| 259 | return submissions | 264 | return submissions |
| 260 | 265 | ||
| 261 | - def kmedoids_clustering(self,data,clusters_dir,distance,k_medoids): | 266 | + def kmedoids_clustering(self,data,clusters_dir,distance,k_medoids,max_popcon): |
| 262 | clusters = KMedoidsClustering(data,lambda x,y: | 267 | clusters = KMedoidsClustering(data,lambda x,y: |
| 263 | distance(x.packages.keys(), | 268 | distance(x.packages.keys(), |
| 264 | - y.packages.keys())) | 269 | + y.packages.keys()),max_popcon) |
| 265 | medoids,dispersion = clusters.getMedoids(k_medoids) | 270 | medoids,dispersion = clusters.getMedoids(k_medoids) |
| 266 | for submission in medoids: | 271 | for submission in medoids: |
| 272 | + logging.debug("Copying submission %s" % submission.user_id) | ||
| 267 | shutil.copyfile(submission.path,os.path.join(clusters_dir, | 273 | shutil.copyfile(submission.path,os.path.join(clusters_dir, |
| 268 | submission.user_id)) | 274 | submission.user_id)) |
| 269 | return dispersion | 275 | return dispersion |
| 270 | 276 | ||
| 271 | class KMedoidsClustering(cluster.KMeansClustering): | 277 | class KMedoidsClustering(cluster.KMeansClustering): |
| 272 | 278 | ||
| 273 | - def __init__(self,data,distance,max_data=100): | ||
| 274 | - # if len(data)<max_data: | ||
| 275 | - # data_sample = data | ||
| 276 | - # else: | ||
| 277 | - # data_sample = random.sample(data,max_data) | ||
| 278 | - # cluster.KMeansClustering.__init__(self, data_sample, distance) | ||
| 279 | - cluster.KMeansClustering.__init__(self, data, distance) | 279 | + def __init__(self,data,distance,max_data): |
| 280 | + if len(data)<max_data: | ||
| 281 | + data_sample = data | ||
| 282 | + else: | ||
| 283 | + data_sample = random.sample(data,max_data) | ||
| 284 | + print data_sample | ||
| 285 | + cluster.KMeansClustering.__init__(self, data_sample, distance) | ||
| 286 | + # cluster.KMeansClustering.__init__(self, data, distance) | ||
| 280 | self.distanceMatrix = {} | 287 | self.distanceMatrix = {} |
| 281 | for submission in self._KMeansClustering__data: | 288 | for submission in self._KMeansClustering__data: |
| 282 | self.distanceMatrix[submission.user_id] = {} | 289 | self.distanceMatrix[submission.user_id] = {} |
| @@ -335,6 +342,8 @@ class KMedoidsClustering(cluster.KMeansClustering): | @@ -335,6 +342,8 @@ class KMedoidsClustering(cluster.KMeansClustering): | ||
| 335 | """ | 342 | """ |
| 336 | #medoids_distances = [self.getMedoid(cluster) for cluster in self.getclusters(n)] | 343 | #medoids_distances = [self.getMedoid(cluster) for cluster in self.getclusters(n)] |
| 337 | medoids_distances = [] | 344 | medoids_distances = [] |
| 345 | + logging.debug("initial length %s" % self._KMeansClustering__initial_length) | ||
| 346 | + logging.debug("n %d" % n) | ||
| 338 | for cluster in self.getclusters(n): | 347 | for cluster in self.getclusters(n): |
| 339 | type(cluster) | 348 | type(cluster) |
| 340 | print cluster | 349 | print cluster |