Commit c9e910a1211092d35b5ce500bb1b2b65a3ff8866

Authored by Tássia Camões Araújo
1 parent e70ddffd
Exists in master and in 1 other branch add_vagrant

Added max_popcon option and fixed bug with getting integer values from config.

Showing 2 changed files with 36 additions and 21 deletions   Show diff stats
@@ -46,6 +46,7 @@ class Config(): @@ -46,6 +46,7 @@ class Config():
46 self.popcon_dir = os.path.expanduser("~/.app-recommender/popcon_dir") 46 self.popcon_dir = os.path.expanduser("~/.app-recommender/popcon_dir")
47 self.clusters_dir = os.path.expanduser("~/.app-recommender/clusters_dir") 47 self.clusters_dir = os.path.expanduser("~/.app-recommender/clusters_dir")
48 self.k_medoids = 100 48 self.k_medoids = 100
  49 + self.max_popcon = 1000
49 self.index_mode = "old" 50 self.index_mode = "old"
50 self.strategy = "cb" 51 self.strategy = "cb"
51 self.weight = "bm25" 52 self.weight = "bm25"
@@ -71,6 +72,7 @@ class Config(): @@ -71,6 +72,7 @@ class Config():
71 print " -u, --indexmode= 'old'|'reindex'|'cluster'|'recluster'" 72 print " -u, --indexmode= 'old'|'reindex'|'cluster'|'recluster'"
72 print " -l, --clustersdir=PATH Path to popcon clusters dir" 73 print " -l, --clustersdir=PATH Path to popcon clusters dir"
73 print " -c, --medoids=k Number of medoids for clustering" 74 print " -c, --medoids=k Number of medoids for clustering"
  75 + print " -x, --maxpopcon=k Number of submissions to be considered"
74 print "" 76 print ""
75 print " [ recommender ]" 77 print " [ recommender ]"
76 print " -w, --weight=OPTION Search weighting scheme" 78 print " -w, --weight=OPTION Search weighting scheme"
@@ -112,8 +114,8 @@ class Config(): @@ -112,8 +114,8 @@ class Config():
112 logging.error("Error in config file syntax: %s", str(err)) 114 logging.error("Error in config file syntax: %s", str(err))
113 os.abort() 115 os.abort()
114 116
115 - self.debug = self.read_option('general', 'debug')  
116 - self.debug = self.read_option('general', 'verbose') 117 + self.debug = int(self.read_option('general', 'debug'))
  118 + self.debug = int(self.read_option('general', 'verbose'))
117 self.output_filename = self.read_option('general', 'output') 119 self.output_filename = self.read_option('general', 'output')
118 self.survey_mode = self.read_option('general', 'survey_mode') 120 self.survey_mode = self.read_option('general', 'survey_mode')
119 121
@@ -123,16 +125,18 @@ class Config(): @@ -123,16 +125,18 @@ class Config():
123 self.popcon_dir = os.path.expanduser(self.read_option('data_sources', 'popcon_dir')) 125 self.popcon_dir = os.path.expanduser(self.read_option('data_sources', 'popcon_dir'))
124 self.index_mode = self.read_option('data_sources', 'index_mode') 126 self.index_mode = self.read_option('data_sources', 'index_mode')
125 self.clusters_dir = os.path.expanduser(self.read_option('data_sources', 'clusters_dir')) 127 self.clusters_dir = os.path.expanduser(self.read_option('data_sources', 'clusters_dir'))
126 - self.k_medoids = self.read_option('data_sources', 'k_medoids') 128 + self.k_medoids = int(self.read_option('data_sources', 'k_medoids'))
  129 + self.max_popcon = int(self.read_option('data_sources', 'max_popcon'))
127 130
128 self.weight = self.read_option('recommender', 'weight') 131 self.weight = self.read_option('recommender', 'weight')
129 self.strategy = self.read_option('recommender', 'strategy') 132 self.strategy = self.read_option('recommender', 'strategy')
130 - self.profile_size = self.read_option('recommender', 'profile_size') 133 + self.profile_size = int(self.read_option('recommender',
  134 + 'profile_size'))
131 135
132 - short_options = "hdvo:a:e:p:m:ul:c:w:s:z:" 136 + short_options = "hdvo:a:e:p:m:ul:c:x:w:s:z:"
133 long_options = ["help", "debug", "verbose", "output=", 137 long_options = ["help", "debug", "verbose", "output=",
134 "axi=", "dde=", "popconindex=", "popcondir=", "indexmode=", 138 "axi=", "dde=", "popconindex=", "popcondir=", "indexmode=",
135 - "clustersdir=", "kmedoids=", "weight=", "strategy=", 139 + "clustersdir=", "kmedoids=", "max_popcon=", "weight=", "strategy=",
136 "profile_size="] 140 "profile_size="]
137 try: 141 try:
138 opts, args = getopt.getopt(sys.argv[1:], short_options, 142 opts, args = getopt.getopt(sys.argv[1:], short_options,
@@ -166,13 +170,15 @@ class Config(): @@ -166,13 +170,15 @@ class Config():
166 elif o in ("-l", "--clustersdir"): 170 elif o in ("-l", "--clustersdir"):
167 self.clusters_dir = p 171 self.clusters_dir = p
168 elif o in ("-c", "--kmedoids"): 172 elif o in ("-c", "--kmedoids"):
169 - self.k_medoids = p 173 + self.k_medoids = int(p)
  174 + elif o in ("-x", "--max_popcon"):
  175 + self.max_popcon = int(p)
170 elif o in ("-w", "--weight"): 176 elif o in ("-w", "--weight"):
171 self.weight = p 177 self.weight = p
172 elif o in ("-s", "--strategy"): 178 elif o in ("-s", "--strategy"):
173 self.strategy = p 179 self.strategy = p
174 elif o in ("-z", "--profile_size"): 180 elif o in ("-z", "--profile_size"):
175 - self.strategy = p 181 + self.strategy = int(p)
176 else: 182 else:
177 assert False, "unhandled option" 183 assert False, "unhandled option"
178 184
@@ -82,7 +82,7 @@ class AppAptXapianIndex(xapian.WritableDatabase): @@ -82,7 +82,7 @@ class AppAptXapianIndex(xapian.WritableDatabase):
82 except: 82 except:
83 logging.info("Doc %d not found in axi." % docid) 83 logging.info("Doc %d not found in axi." % docid)
84 logging.info("AppAptXapianIndex size: %d (lastdocid: %d)." % 84 logging.info("AppAptXapianIndex size: %d (lastdocid: %d)." %
85 - self.get_doccount(), self.get_lastdocid()) 85 + (self.get_doccount(), self.get_lastdocid()))
86 86
87 def __str__(self): 87 def __str__(self):
88 return print_index(self) 88 return print_index(self)
@@ -166,6 +166,7 @@ class PopconXapianIndex(xapian.WritableDatabase): @@ -166,6 +166,7 @@ class PopconXapianIndex(xapian.WritableDatabase):
166 raise Error 166 raise Error
167 if cfg.index_mode == "reindex": 167 if cfg.index_mode == "reindex":
168 self.source_dir = os.path.expanduser(cfg.popcon_dir) 168 self.source_dir = os.path.expanduser(cfg.popcon_dir)
  169 + logging.debug(self.source_dir)
169 else: 170 else:
170 self.source_dir = os.path.expanduser(cfg.clusters_dir) 171 self.source_dir = os.path.expanduser(cfg.clusters_dir)
171 if not os.path.exists(cfg.clusters_dir): 172 if not os.path.exists(cfg.clusters_dir):
@@ -180,10 +181,12 @@ class PopconXapianIndex(xapian.WritableDatabase): @@ -180,10 +181,12 @@ class PopconXapianIndex(xapian.WritableDatabase):
180 % cfg.clusters_dir) 181 % cfg.clusters_dir)
181 distance = JaccardDistance() 182 distance = JaccardDistance()
182 data = self.get_submissions(cfg.popcon_dir) 183 data = self.get_submissions(cfg.popcon_dir)
  184 + logging.debug(type(data))
183 self.cluster_dispersion = \ 185 self.cluster_dispersion = \
184 self.kmedoids_clustering(data, cfg.clusters_dir, 186 self.kmedoids_clustering(data, cfg.clusters_dir,
185 - distance, cfg.k_medoids)  
186 - logging.info("Clusters dispersion: %f.2", 187 + distance, cfg.k_medoids,
  188 + cfg.max_popcon)
  189 + logging.info("Clusters dispersion: %.2f",
187 self.cluster_dispersion) 190 self.cluster_dispersion)
188 else: 191 else:
189 logging.info("Using clusters from \'%s\'" % 192 logging.info("Using clusters from \'%s\'" %
@@ -221,8 +224,9 @@ class PopconXapianIndex(xapian.WritableDatabase): @@ -221,8 +224,9 @@ class PopconXapianIndex(xapian.WritableDatabase):
221 self.path) 224 self.path)
222 xapian.WritableDatabase.__init__(self,self.path, 225 xapian.WritableDatabase.__init__(self,self.path,
223 xapian.DB_CREATE_OR_OVERWRITE) 226 xapian.DB_CREATE_OR_OVERWRITE)
224 - except xapian.DatabaseError: 227 + except xapian.DatabaseError as e:
225 logging.critical("Could not create popcon xapian index.") 228 logging.critical("Could not create popcon xapian index.")
  229 + logging.critical(str(e))
226 raise Error 230 raise Error
227 231
228 for root, dirs, files in os.walk(self.source_dir): 232 for root, dirs, files in os.walk(self.source_dir):
@@ -254,29 +258,32 @@ class PopconXapianIndex(xapian.WritableDatabase): @@ -254,29 +258,32 @@ class PopconXapianIndex(xapian.WritableDatabase):
254 submissions = [] 258 submissions = []
255 for root, dirs, files in os.walk(submissions_dir): 259 for root, dirs, files in os.walk(submissions_dir):
256 for popcon_file in files: 260 for popcon_file in files:
  261 + logging.debug("Parsing submission %s" % popcon_file)
257 submission = PopconSubmission(os.path.join(root, popcon_file)) 262 submission = PopconSubmission(os.path.join(root, popcon_file))
258 submissions.append(submission) 263 submissions.append(submission)
259 return submissions 264 return submissions
260 265
261 - def kmedoids_clustering(self,data,clusters_dir,distance,k_medoids): 266 + def kmedoids_clustering(self,data,clusters_dir,distance,k_medoids,max_popcon):
262 clusters = KMedoidsClustering(data,lambda x,y: 267 clusters = KMedoidsClustering(data,lambda x,y:
263 distance(x.packages.keys(), 268 distance(x.packages.keys(),
264 - y.packages.keys())) 269 + y.packages.keys()),max_popcon)
265 medoids,dispersion = clusters.getMedoids(k_medoids) 270 medoids,dispersion = clusters.getMedoids(k_medoids)
266 for submission in medoids: 271 for submission in medoids:
  272 + logging.debug("Copying submission %s" % submission.user_id)
267 shutil.copyfile(submission.path,os.path.join(clusters_dir, 273 shutil.copyfile(submission.path,os.path.join(clusters_dir,
268 submission.user_id)) 274 submission.user_id))
269 return dispersion 275 return dispersion
270 276
271 class KMedoidsClustering(cluster.KMeansClustering): 277 class KMedoidsClustering(cluster.KMeansClustering):
272 278
273 - def __init__(self,data,distance,max_data=100):  
274 - # if len(data)<max_data:  
275 - # data_sample = data  
276 - # else:  
277 - # data_sample = random.sample(data,max_data)  
278 - # cluster.KMeansClustering.__init__(self, data_sample, distance)  
279 - cluster.KMeansClustering.__init__(self, data, distance) 279 + def __init__(self,data,distance,max_data):
  280 + if len(data)<max_data:
  281 + data_sample = data
  282 + else:
  283 + data_sample = random.sample(data,max_data)
  284 + print data_sample
  285 + cluster.KMeansClustering.__init__(self, data_sample, distance)
  286 + # cluster.KMeansClustering.__init__(self, data, distance)
280 self.distanceMatrix = {} 287 self.distanceMatrix = {}
281 for submission in self._KMeansClustering__data: 288 for submission in self._KMeansClustering__data:
282 self.distanceMatrix[submission.user_id] = {} 289 self.distanceMatrix[submission.user_id] = {}
@@ -335,6 +342,8 @@ class KMedoidsClustering(cluster.KMeansClustering): @@ -335,6 +342,8 @@ class KMedoidsClustering(cluster.KMeansClustering):
335 """ 342 """
336 #medoids_distances = [self.getMedoid(cluster) for cluster in self.getclusters(n)] 343 #medoids_distances = [self.getMedoid(cluster) for cluster in self.getclusters(n)]
337 medoids_distances = [] 344 medoids_distances = []
  345 + logging.debug("initial length %s" % self._KMeansClustering__initial_length)
  346 + logging.debug("n %d" % n)
338 for cluster in self.getclusters(n): 347 for cluster in self.getclusters(n):
339 type(cluster) 348 type(cluster)
340 print cluster 349 print cluster