Commit 353b42add083cccd2d18764ef0ec6e6b6b1878b6 (1 parent: 37e376c1)
Exists in master and in 1 other branch.
Data classes complete refactoring.
Showing 1 changed file with 134 additions and 171 deletions.
Show diff stats
src/data.py
... | ... | @@ -22,17 +22,14 @@ __license__ = """ |
22 | 22 | import os |
23 | 23 | import sys |
24 | 24 | import gc |
25 | -import re | |
26 | 25 | import xapian |
27 | -import axi | |
28 | -from debian import debtags | |
29 | 26 | import logging |
30 | -import hashlib | |
31 | 27 | import random |
28 | +import cluster | |
29 | +import shutil | |
32 | 30 | |
33 | 31 | from error import Error |
34 | 32 | from singleton import Singleton |
35 | -import cluster | |
36 | 33 | from dissimilarity import * |
37 | 34 | |
38 | 35 | def axi_search_pkgs(axi,pkgs_list): |
... | ... | @@ -53,101 +50,114 @@ def axi_search_pkg_tags(axi,pkg): |
53 | 50 | term.term.startswith("XT")] |
54 | 51 | return tags |
55 | 52 | |
53 | +def print_index(index): | |
54 | + output = "\n---\n" + xapian.Database.__repr__(index) + "\n---\n" | |
55 | + for term in index.allterms(): | |
56 | + output += term.term+"\n" | |
57 | + output += str([index.get_document(posting.docid).get_data() | |
58 | + for posting in index.postlist(term.term)]) | |
59 | + output += "\n---" | |
60 | + return output | |
61 | + | |
56 | 62 | class SampleAptXapianIndex(xapian.WritableDatabase): |
57 | 63 | """ |
58 | 64 | Sample data source for packages information, mainly useful for tests. |
59 | 65 | """ |
60 | - def __init__(self,pkgs_list,axi): | |
61 | - xapian.WritableDatabase.__init__(self,".sample_axi", | |
66 | + def __init__(self,pkgs_list,axi,path): | |
67 | + xapian.WritableDatabase.__init__(self,path, | |
62 | 68 | xapian.DB_CREATE_OR_OVERWRITE) |
63 | 69 | sample = axi_search_pkgs(axi,pkgs_list) |
64 | - self.all_docs = [] | |
65 | 70 | for package in sample: |
66 | 71 | doc_id = self.add_document(axi.get_document(package.docid)) |
67 | - self.all_docs.append(doc_id) | |
68 | 72 | |
69 | - def _print(self): | |
70 | - print "---" | |
71 | - print xapian.WritableDatabase.__repr__(self) | |
72 | - print "---" | |
73 | - for doc_id in self.all_docs: | |
74 | - print [term.term for term in self.get_document(doc_id).termlist()] | |
75 | - print "---" | |
73 | + def __str__(self): | |
74 | + return print_index(self) | |
76 | 75 | |
77 | 76 | class PopconSubmission(): |
78 | - def __init__(self,submission_hash): | |
79 | - self.hash = submission_hash | |
80 | - self.pkgs_list = [] | |
77 | + def __init__(self,path,user_id=0): | |
78 | + self.packages = dict() | |
79 | + self.path = path | |
80 | + self.load() | |
81 | + if user_id: | |
82 | + self.user_id = user_id | |
81 | 83 | |
82 | - def add_pkg(self,pkg): | |
83 | - self.pkgs_list.append(pkg) | |
84 | + def __str__(self): | |
85 | + output = "\nPopularity-contest submission ID "+self.user_id | |
86 | + for pkg, weight in self.packages.items(): | |
87 | + output += "\n "+pkg+": "+str(weight) | |
88 | + return output | |
84 | 89 | |
85 | - def parse_submission(self,submission_path,binary=1): | |
90 | + def load(self,binary=1): | |
86 | 91 | """ |
87 | 92 | Parse a popcon submission, generating the names of the valid packages |
88 | 93 | in the vote. |
89 | 94 | """ |
90 | - submission = open(submission_path) | |
91 | - for line in submission: | |
92 | - if not line.startswith("POPULARITY"): | |
93 | - if not line.startswith("END-POPULARITY"): | |
94 | - data = line[:-1].split(" ") | |
95 | - if len(data) > 3: | |
96 | - if binary: | |
97 | - # every installed package has the same weight | |
98 | - yield data[2], 1 | |
99 | - elif data[3] == '<NOFILES>': | |
95 | + with open(self.path) as submission: | |
96 | + for line in submission: | |
97 | + if line.startswith("POPULARITY"): | |
98 | + self.user_id = line.split()[2].lstrip("ID:") | |
99 | + elif not line.startswith("END-POPULARITY"): | |
100 | + data = line.rstrip('\n').split() | |
101 | + if len(data) > 2: | |
102 | + pkg = data[2] | |
103 | + if len(data) > 3: | |
104 | + exec_file = data[3] | |
105 | + # Binary weight | |
106 | + if binary: | |
107 | + self.packages[pkg] = 1 | |
108 | + # Weights inherited from Enrico's anapop | |
100 | 109 | # No executable files to track |
101 | - yield data[2], 1 | |
102 | - elif len(data) == 4: | |
110 | + elif exec_file == '<NOFILES>': | |
111 | + self.packages[pkg] = 1 | |
103 | 112 | # Recently used packages |
104 | - yield data[2], 10 | |
105 | - elif data[4] == '<OLD>': | |
113 | + elif len(data) == 4: | |
114 | + self.packages[pkg] = 10 | |
106 | 115 | # Unused packages |
107 | - yield data[2], 3 | |
108 | - elif data[4] == '<RECENT-CTIME>': | |
116 | + elif data[4] == '<OLD>': | |
117 | + self.packages[pkg] = 3 | |
109 | 118 | # Recently installed packages |
110 | - yield data[2], 8 | |
111 | -class PopconXapianIndex(xapian.WritableDatabase,Singleton): | |
119 | + elif data[4] == '<RECENT-CTIME>': | |
120 | + self.packages[pkg] = 8 | |
121 | + | |
122 | +class PopconXapianIndex(xapian.WritableDatabase): | |
112 | 123 | """ |
113 | 124 | Data source for popcon submissions defined as a singleton xapian database. |
114 | 125 | """ |
115 | - def __init__(self,cfg): | |
126 | + def __init__(self,cfg,reindex=0,recluster=0): | |
116 | 127 | """ |
117 | 128 | Set initial attributes. |
118 | 129 | """ |
119 | - self.path = os.path.expanduser(cfg.popcon_index) | |
120 | - self.popcon_dir = os.path.expanduser(cfg.popcon_dir) | |
121 | - #self.debtags_path = os.path.expanduser(cfg.tags_db) | |
122 | 130 | self.axi = xapian.Database(cfg.axi) |
123 | - self.load_index() | |
131 | + self.path = os.path.expanduser(cfg.popcon_index) | |
132 | + if reindex or not self.load_index(): | |
133 | + if not os.path.exists(cfg.popcon_dir): | |
134 | + os.makedirs(cfg.popcon_dir) | |
135 | + if not os.listdir(cfg.popcon_dir): | |
136 | + logging.critical("Popcon dir seems to be empty.") | |
137 | + raise Error | |
138 | + if not cfg.clustering: | |
139 | + self.source_dir = os.path.expanduser(cfg.popcon_dir) | |
140 | + else: | |
141 | + self.source_dir = os.path.expanduser(cfg.clusters_dir) | |
142 | + if not os.path.exists(cfg.clusters_dir): | |
143 | + os.makedirs(cfg.clusters_dir) | |
144 | + if not os.listdir(cfg.clusters_dir): | |
145 | + distance = JaccardDistance() | |
146 | + logging.info("Clustering popcon submissions from \'%s\'" | |
147 | + % cfg.popcon_dir) | |
148 | + logging.info("Clusters will be placed at \'%s\'" | |
149 | + % cfg.clusters_dir) | |
150 | + data = self.get_submissions(cfg.popcon_dir) | |
151 | + if cfg.clustering == "Hierarchical": | |
152 | + self.hierarchical_clustering(data,cfg.clusters_dir, | |
153 | + distance) | |
154 | + else: | |
155 | + self.kmedoids_clustering(data,cfg.clusters_dir, | |
156 | + distance) | |
157 | + self.build_index() | |
124 | 158 | |
125 | - def parse_submission(self,submission_path,binary=1): | |
126 | - """ | |
127 | - Parse a popcon submission, generating the names of the valid packages | |
128 | - in the vote. | |
129 | - """ | |
130 | - submission = open(submission_path) | |
131 | - for line in submission: | |
132 | - if not line.startswith("POPULARITY"): | |
133 | - if not line.startswith("END-POPULARITY"): | |
134 | - data = line[:-1].split(" ") | |
135 | - if len(data) > 3: | |
136 | - if binary: | |
137 | - # every installed package has the same weight | |
138 | - yield data[2], 1 | |
139 | - elif data[3] == '<NOFILES>': | |
140 | - # No executable files to track | |
141 | - yield data[2], 1 | |
142 | - elif len(data) == 4: | |
143 | - # Recently used packages | |
144 | - yield data[2], 10 | |
145 | - elif data[4] == '<OLD>': | |
146 | - # Unused packages | |
147 | - yield data[2], 3 | |
148 | - elif data[4] == '<RECENT-CTIME>': | |
149 | - # Recently installed packages | |
150 | - yield data[2], 8 | |
159 | + def __str__(self): | |
160 | + return print_index(self) | |
151 | 161 | |
152 | 162 | def load_index(self): |
153 | 163 | """ |
... | ... | @@ -159,19 +169,19 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton): |
159 | 169 | xapian.Database.__init__(self,self.path) |
160 | 170 | except xapian.DatabaseError: |
161 | 171 | logging.info("Could not open popcon index.") |
162 | - self.new_index() | |
172 | + return 0 | |
163 | 173 | |
164 | - def new_index(self): | |
174 | + def build_index(self): | |
165 | 175 | """ |
166 | - Create a xapian index for popcon submissions at 'popcon_dir' and | |
176 | + Create a xapian index for popcon submissions at 'source_dir' and | |
167 | 177 | place it at 'self.path'. |
168 | 178 | """ |
169 | - if not os.path.exists(self.path): | |
170 | - os.makedirs(self.path) | |
179 | + shutil.rmtree(self.path,1) | |
180 | + os.makedirs(self.path) | |
171 | 181 | |
172 | 182 | try: |
173 | 183 | logging.info("Indexing popcon submissions from \'%s\'" % |
174 | - self.popcon_dir) | |
184 | + self.source_dir) | |
175 | 185 | logging.info("Creating new xapian index at \'%s\'" % |
176 | 186 | self.path) |
177 | 187 | xapian.WritableDatabase.__init__(self,self.path, |
... | ... | @@ -180,123 +190,79 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton): |
180 | 190 | logging.critical("Could not create popcon xapian index.") |
181 | 191 | raise Error |
182 | 192 | |
183 | - for root, dirs, files in os.walk(self.popcon_dir): | |
184 | - for submission in files: | |
185 | - submission_path = os.path.join(root, submission) | |
193 | + for root, dirs, files in os.walk(self.source_dir): | |
194 | + for popcon_file in files: | |
195 | + submission = PopconSubmission(os.path.join(root, popcon_file)) | |
186 | 196 | doc = xapian.Document() |
187 | - doc.set_data(submission) | |
188 | - logging.debug("Parsing popcon submission at \'%s\'" % | |
189 | - submission_path) | |
190 | - for pkg, freq in self.parse_submission(submission_path): | |
197 | + doc.set_data(submission.user_id) | |
198 | + logging.debug("Parsing popcon submission \'%s\'" % | |
199 | + submission.user_id) | |
200 | + for pkg, freq in submission.packages.items(): | |
191 | 201 | doc.add_term("XP"+pkg,freq) |
192 | 202 | for tag in axi_search_pkg_tags(self.axi,pkg): |
193 | - print tag | |
194 | 203 | doc.add_term(tag,freq) |
195 | 204 | doc_id = self.add_document(doc) |
196 | 205 | logging.debug("Popcon Xapian: Indexing doc %d" % doc_id) |
197 | 206 | # python garbage collector |
198 | 207 | gc.collect() |
199 | 208 | # flush to disk database changes |
200 | - self.flush() | |
209 | + self.commit() | |
201 | 210 | |
202 | -class PopconClusteredData(Singleton): | |
203 | - """ | |
204 | - Data source for popcon submissions defined as a singleton xapian database. | |
205 | - """ | |
206 | - def __init__(self,cfg): | |
211 | + def get_submissions(self,submissions_dir): | |
207 | 212 | """ |
208 | - Set initial attributes. | |
213 | + Get popcon submissions from popcon_dir | |
209 | 214 | """ |
210 | - self.popcon_dir = os.path.expanduser(cfg.popcon_dir) | |
211 | - self.clusters_dir = os.path.expanduser(cfg.clusters_dir) | |
212 | - self.submissions = [] | |
213 | - self.clustering() | |
215 | + submissions = [] | |
216 | + for root, dirs, files in os.walk(submissions_dir): | |
217 | + for popcon_file in files: | |
218 | + submission = PopconSubmission(os.path.join(root, popcon_file)) | |
219 | + submissions.append(submission) | |
220 | + return submissions | |
214 | 221 | |
215 | - def parse_submission(self,submission_path,binary=1): | |
216 | - """ | |
217 | - Parse a popcon submission, generating the names of the valid packages | |
218 | - in the vote. | |
219 | - """ | |
220 | - submission_file = open(submission_path) | |
221 | - for line in submission_file: | |
222 | - if not line.startswith("POPULARITY"): | |
223 | - if not line.startswith("END-POPULARITY"): | |
224 | - data = line[:-1].split(" ") | |
225 | - if len(data) > 3: | |
226 | - if binary: | |
227 | - # every installed package has the same weight | |
228 | - yield data[2], 1 | |
229 | - elif data[3] == '<NOFILES>': | |
230 | - # No executable files to track | |
231 | - yield data[2], 1 | |
232 | - elif len(data) == 4: | |
233 | - # Recently used packages | |
234 | - yield data[2], 10 | |
235 | - elif data[4] == '<OLD>': | |
236 | - # Unused packages | |
237 | - yield data[2], 3 | |
238 | - elif data[4] == '<RECENT-CTIME>': | |
239 | - # Recently installed packages | |
240 | - yield data[2], 8 | |
241 | - | |
242 | - def clustering(self): | |
222 | + def hierarchical_clustering(self,data,clusters_dir,distance,k=10): | |
243 | 223 | """ |
244 | - called by init | |
245 | - Create a xapian index for popcon submissions at 'popcon_dir' and | |
246 | - place it at 'self.path'. | |
224 | + Select popcon submissions from popcon_dir and place them at clusters_dir | |
247 | 225 | """ |
248 | - if not os.path.exists(self.clusters_dir): | |
249 | - os.makedirs(self.clusters_dir) | |
250 | - | |
251 | - logging.info("Clustering popcon submissions from \'%s\'" % | |
252 | - self.popcon_dir) | |
253 | - logging.info("Clusters will be placed at \'%s\'" % self.clusters_dir) | |
226 | + cl = cluster.HierarchicalClustering(data, lambda x,y: | |
227 | + distance(x.packages.keys(), | |
228 | + y.packages.keys())) | |
229 | + clusters = cl.getlevel(0.5) | |
230 | + for c in clusters: | |
231 | + print "cluster" | |
232 | + for submission in c: | |
233 | + print submission.user_id | |
254 | 234 | |
255 | - for root, dirs, files in os.walk(self.popcon_dir): | |
256 | - for submission_hash in files: | |
257 | - s = PopconSubmission(submission_hash) | |
258 | - submission_path = os.path.join(root, submission_hash) | |
259 | - logging.debug("Parsing popcon submission \'%s\'" % | |
260 | - submission_hash) | |
261 | - for pkg, freq in self.parse_submission(submission_path): | |
262 | - s.add_pkg(pkg) | |
263 | - self.submissions.append(s) | |
264 | - | |
265 | - distanceFunction = JaccardDistance() | |
266 | - # cl = cluster.HierarchicalClustering(self.submissions,lambda x,y: distanceFunction(x.pkgs_list,y.pkgs_list)) | |
267 | - # clusters = cl.getlevel(0.5) | |
268 | - # for c in clusters: | |
269 | - # print "cluster" | |
270 | - # for submission in c: | |
271 | - # print submission.hash | |
272 | - cl = KMedoidsClusteringPopcon(self.submissions, lambda x,y: \ | |
273 | - distanceFunction(x.pkgs_list,y.pkgs_list)) | |
274 | - #clusters = cl.getclusters(2) | |
275 | - medoids = cl.getMedoids(2) | |
276 | - print "medoids" | |
277 | - for m in medoids: | |
278 | - print m.hash | |
235 | + def kmedoids_clustering(self,data,clusters_dir,distance,k=10): | |
236 | + clusters = KMedoidsClustering(data,lambda x,y: | |
237 | + distance(x.packages.keys(), | |
238 | + y.packages.keys())) | |
239 | + medoids = clusters.getMedoids(2) | |
240 | + for submission in medoids: | |
241 | + shutil.copyfile(submission.path,os.path.join(clusters_dir, | |
242 | + submission.user_id)) | |
279 | 243 | |
280 | -class KMedoidsClusteringPopcon(cluster.KMeansClustering): | |
244 | +class KMedoidsClustering(cluster.KMeansClustering): | |
281 | 245 | |
282 | 246 | def __init__(self,data,distance): |
283 | - if len(data)>100: | |
247 | + if len(data)<100: | |
248 | + data_sample = data | |
249 | + else: | |
284 | 250 | data_sample = random.sample(data,100) |
285 | 251 | cluster.KMeansClustering.__init__(self, data_sample, distance) |
286 | 252 | self.distanceMatrix = {} |
287 | 253 | for submission in self._KMeansClustering__data: |
288 | - self.distanceMatrix[submission.hash] = {} | |
254 | + self.distanceMatrix[submission.user_id] = {} | |
289 | 255 | |
290 | 256 | def loadDistanceMatrix(self,cluster): |
291 | 257 | for i in range(len(cluster)-1): |
292 | 258 | for j in range(i+1,len(cluster)): |
293 | 259 | try: |
294 | - d = self.distanceMatrix[cluster[i].hash][cluster[j].hash] | |
260 | + d = self.distanceMatrix[cluster[i].user_id][cluster[j].user_id] | |
295 | 261 | logging.debug("Using d[%d,%d]" % (i,j)) |
296 | 262 | except: |
297 | 263 | d = self.distance(cluster[i],cluster[j]) |
298 | - self.distanceMatrix[cluster[i].hash][cluster[j].hash] = d | |
299 | - self.distanceMatrix[cluster[j].hash][cluster[i].hash] = d | |
264 | + self.distanceMatrix[cluster[i].user_id][cluster[j].user_id] = d | |
265 | + self.distanceMatrix[cluster[j].user_id][cluster[i].user_id] = d | |
300 | 266 | logging.debug("d[%d,%d] = %.2f" % (i,j,d)) |
301 | 267 | |
302 | 268 | def getMedoid(self,cluster): |
... | ... | @@ -308,22 +274,19 @@ class KMedoidsClusteringPopcon(cluster.KMeansClustering): |
308 | 274 | self.loadDistanceMatrix(cluster) |
309 | 275 | medoidDistance = sys.maxint |
310 | 276 | for i in range(len(cluster)): |
311 | - totalDistance = sum(self.distanceMatrix[cluster[i].hash].values()) | |
277 | + totalDistance = sum(self.distanceMatrix[cluster[i].user_id].values()) | |
312 | 278 | print "totalDistance[",i,"]=",totalDistance |
313 | 279 | if totalDistance < medoidDistance: |
314 | 280 | medoidDistance = totalDistance |
315 | 281 | medoid = i |
316 | 282 | print "medoidDistance:",medoidDistance |
317 | - logging.debug("Cluster medoid: [%d] %s" % (medoid, cluster[medoid].hash)) | |
283 | + logging.debug("Cluster medoid: [%d] %s" % (medoid, | |
284 | + cluster[medoid].user_id)) | |
318 | 285 | return cluster[medoid] |
319 | 286 | |
320 | 287 | def assign_item(self, item, origin): |
321 | 288 | """ |
322 | 289 | Assigns an item from a given cluster to the closest located cluster |
323 | - | |
324 | - PARAMETERS | |
325 | - item - the item to be moved | |
326 | - origin - the originating cluster | |
327 | 290 | """ |
328 | 291 | closest_cluster = origin |
329 | 292 | for cluster in self._KMeansClustering__clusters: |
... | ... | @@ -332,7 +295,7 @@ class KMedoidsClusteringPopcon(cluster.KMeansClustering): |
332 | 295 | |
333 | 296 | if closest_cluster != origin: |
334 | 297 | self.move_item(item, origin, closest_cluster) |
335 | - logging.debug("Item changed cluster: %s" % item.hash) | |
298 | + logging.debug("Item changed cluster: %s" % item.user_id) | |
336 | 299 | return True |
337 | 300 | else: |
338 | 301 | return False |
... | ... | @@ -342,5 +305,5 @@ class KMedoidsClusteringPopcon(cluster.KMeansClustering): |
342 | 305 | Generate n clusters and return their medoids. |
343 | 306 | """ |
344 | 307 | medoids = [self.getMedoid(cluster) for cluster in self.getclusters(n)] |
345 | - logging.info("Clustering completed and the following centroids were found: %s" % [c.hash for c in medoids]) | |
308 | + logging.info("Clustering completed and the following centroids were found: %s" % [c.user_id for c in medoids]) | |
346 | 309 | return medoids | ... | ... |