Commit 353b42add083cccd2d18764ef0ec6e6b6b1878b6

Authored by Tássia Camões Araújo
1 parent 37e376c1
Exists in master and in 1 other branch add_vagrant

Data classes complete refactoring.

Showing 1 changed file with 134 additions and 171 deletions   Show diff stats
@@ -22,17 +22,14 @@ __license__ = """ @@ -22,17 +22,14 @@ __license__ = """
22 import os 22 import os
23 import sys 23 import sys
24 import gc 24 import gc
25 -import re  
26 import xapian 25 import xapian
27 -import axi  
28 -from debian import debtags  
29 import logging 26 import logging
30 -import hashlib  
31 import random 27 import random
  28 +import cluster
  29 +import shutil
32 30
33 from error import Error 31 from error import Error
34 from singleton import Singleton 32 from singleton import Singleton
35 -import cluster  
36 from dissimilarity import * 33 from dissimilarity import *
37 34
38 def axi_search_pkgs(axi,pkgs_list): 35 def axi_search_pkgs(axi,pkgs_list):
@@ -53,101 +50,114 @@ def axi_search_pkg_tags(axi,pkg): @@ -53,101 +50,114 @@ def axi_search_pkg_tags(axi,pkg):
53 term.term.startswith("XT")] 50 term.term.startswith("XT")]
54 return tags 51 return tags
55 52
  53 +def print_index(index):
  54 + output = "\n---\n" + xapian.Database.__repr__(index) + "\n---\n"
  55 + for term in index.allterms():
  56 + output += term.term+"\n"
  57 + output += str([index.get_document(posting.docid).get_data()
  58 + for posting in index.postlist(term.term)])
  59 + output += "\n---"
  60 + return output
  61 +
56 class SampleAptXapianIndex(xapian.WritableDatabase): 62 class SampleAptXapianIndex(xapian.WritableDatabase):
57 """ 63 """
58 Sample data source for packages information, mainly useful for tests. 64 Sample data source for packages information, mainly useful for tests.
59 """ 65 """
60 - def __init__(self,pkgs_list,axi):  
61 - xapian.WritableDatabase.__init__(self,".sample_axi", 66 + def __init__(self,pkgs_list,axi,path):
  67 + xapian.WritableDatabase.__init__(self,path,
62 xapian.DB_CREATE_OR_OVERWRITE) 68 xapian.DB_CREATE_OR_OVERWRITE)
63 sample = axi_search_pkgs(axi,pkgs_list) 69 sample = axi_search_pkgs(axi,pkgs_list)
64 - self.all_docs = []  
65 for package in sample: 70 for package in sample:
66 doc_id = self.add_document(axi.get_document(package.docid)) 71 doc_id = self.add_document(axi.get_document(package.docid))
67 - self.all_docs.append(doc_id)  
68 72
69 - def _print(self):  
70 - print "---"  
71 - print xapian.WritableDatabase.__repr__(self)  
72 - print "---"  
73 - for doc_id in self.all_docs:  
74 - print [term.term for term in self.get_document(doc_id).termlist()]  
75 - print "---" 73 + def __str__(self):
  74 + return print_index(self)
76 75
77 class PopconSubmission(): 76 class PopconSubmission():
78 - def __init__(self,submission_hash):  
79 - self.hash = submission_hash  
80 - self.pkgs_list = [] 77 + def __init__(self,path,user_id=0):
  78 + self.packages = dict()
  79 + self.path = path
  80 + self.load()
  81 + if user_id:
  82 + self.user_id = user_id
81 83
82 - def add_pkg(self,pkg):  
83 - self.pkgs_list.append(pkg) 84 + def __str__(self):
  85 + output = "\nPopularity-contest submission ID "+self.user_id
  86 + for pkg, weight in self.packages.items():
  87 + output += "\n "+pkg+": "+str(weight)
  88 + return output
84 89
85 - def parse_submission(self,submission_path,binary=1): 90 + def load(self,binary=1):
86 """ 91 """
87 Parse a popcon submission, generating the names of the valid packages 92 Parse a popcon submission, generating the names of the valid packages
88 in the vote. 93 in the vote.
89 """ 94 """
90 - submission = open(submission_path)  
91 - for line in submission:  
92 - if not line.startswith("POPULARITY"):  
93 - if not line.startswith("END-POPULARITY"):  
94 - data = line[:-1].split(" ")  
95 - if len(data) > 3:  
96 - if binary:  
97 - # every installed package has the same weight  
98 - yield data[2], 1  
99 - elif data[3] == '<NOFILES>': 95 + with open(self.path) as submission:
  96 + for line in submission:
  97 + if line.startswith("POPULARITY"):
  98 + self.user_id = line.split()[2].lstrip("ID:")
  99 + elif not line.startswith("END-POPULARITY"):
  100 + data = line.rstrip('\n').split()
  101 + if len(data) > 2:
  102 + pkg = data[2]
  103 + if len(data) > 3:
  104 + exec_file = data[3]
  105 + # Binary weight
  106 + if binary:
  107 + self.packages[pkg] = 1
  108 + # Weights inherited from Enrico's anapop
100 # No executable files to track 109 # No executable files to track
101 - yield data[2], 1  
102 - elif len(data) == 4: 110 + elif exec_file == '<NOFILES>':
  111 + self.packages[pkg] = 1
103 # Recently used packages 112 # Recently used packages
104 - yield data[2], 10  
105 - elif data[4] == '<OLD>': 113 + elif len(data) == 4:
  114 + self.packages[pkg] = 10
106 # Unused packages 115 # Unused packages
107 - yield data[2], 3  
108 - elif data[4] == '<RECENT-CTIME>': 116 + elif data[4] == '<OLD>':
  117 + self.packages[pkg] = 3
109 # Recently installed packages 118 # Recently installed packages
110 - yield data[2], 8  
111 -class PopconXapianIndex(xapian.WritableDatabase,Singleton): 119 + elif data[4] == '<RECENT-CTIME>':
  120 + self.packages[pkg] = 8
  121 +
  122 +class PopconXapianIndex(xapian.WritableDatabase):
112 """ 123 """
113 Data source for popcon submissions defined as a singleton xapian database. 124 Data source for popcon submissions defined as a singleton xapian database.
114 """ 125 """
115 - def __init__(self,cfg): 126 + def __init__(self,cfg,reindex=0,recluster=0):
116 """ 127 """
117 Set initial attributes. 128 Set initial attributes.
118 """ 129 """
119 - self.path = os.path.expanduser(cfg.popcon_index)  
120 - self.popcon_dir = os.path.expanduser(cfg.popcon_dir)  
121 - #self.debtags_path = os.path.expanduser(cfg.tags_db)  
122 self.axi = xapian.Database(cfg.axi) 130 self.axi = xapian.Database(cfg.axi)
123 - self.load_index() 131 + self.path = os.path.expanduser(cfg.popcon_index)
  132 + if reindex or not self.load_index():
  133 + if not os.path.exists(cfg.popcon_dir):
  134 + os.makedirs(cfg.popcon_dir)
  135 + if not os.listdir(cfg.popcon_dir):
  136 + logging.critical("Popcon dir seems to be empty.")
  137 + raise Error
  138 + if not cfg.clustering:
  139 + self.source_dir = os.path.expanduser(cfg.popcon_dir)
  140 + else:
  141 + self.source_dir = os.path.expanduser(cfg.clusters_dir)
  142 + if not os.path.exists(cfg.clusters_dir):
  143 + os.makedirs(cfg.clusters_dir)
  144 + if not os.listdir(cfg.clusters_dir):
  145 + distance = JaccardDistance()
  146 + logging.info("Clustering popcon submissions from \'%s\'"
  147 + % cfg.popcon_dir)
  148 + logging.info("Clusters will be placed at \'%s\'"
  149 + % cfg.clusters_dir)
  150 + data = self.get_submissions(cfg.popcon_dir)
  151 + if cfg.clustering == "Hierarchical":
  152 + self.hierarchical_clustering(data,cfg.clusters_dir,
  153 + distance)
  154 + else:
  155 + self.kmedoids_clustering(data,cfg.clusters_dir,
  156 + distance)
  157 + self.build_index()
124 158
125 - def parse_submission(self,submission_path,binary=1):  
126 - """  
127 - Parse a popcon submission, generating the names of the valid packages  
128 - in the vote.  
129 - """  
130 - submission = open(submission_path)  
131 - for line in submission:  
132 - if not line.startswith("POPULARITY"):  
133 - if not line.startswith("END-POPULARITY"):  
134 - data = line[:-1].split(" ")  
135 - if len(data) > 3:  
136 - if binary:  
137 - # every installed package has the same weight  
138 - yield data[2], 1  
139 - elif data[3] == '<NOFILES>':  
140 - # No executable files to track  
141 - yield data[2], 1  
142 - elif len(data) == 4:  
143 - # Recently used packages  
144 - yield data[2], 10  
145 - elif data[4] == '<OLD>':  
146 - # Unused packages  
147 - yield data[2], 3  
148 - elif data[4] == '<RECENT-CTIME>':  
149 - # Recently installed packages  
150 - yield data[2], 8 159 + def __str__(self):
  160 + return print_index(self)
151 161
152 def load_index(self): 162 def load_index(self):
153 """ 163 """
@@ -159,19 +169,19 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton): @@ -159,19 +169,19 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton):
159 xapian.Database.__init__(self,self.path) 169 xapian.Database.__init__(self,self.path)
160 except xapian.DatabaseError: 170 except xapian.DatabaseError:
161 logging.info("Could not open popcon index.") 171 logging.info("Could not open popcon index.")
162 - self.new_index() 172 + return 0
163 173
164 - def new_index(self): 174 + def build_index(self):
165 """ 175 """
166 - Create a xapian index for popcon submissions at 'popcon_dir' and 176 + Create a xapian index for popcon submissions at 'source_dir' and
167 place it at 'self.path'. 177 place it at 'self.path'.
168 """ 178 """
169 - if not os.path.exists(self.path):  
170 - os.makedirs(self.path) 179 + shutil.rmtree(self.path,1)
  180 + os.makedirs(self.path)
171 181
172 try: 182 try:
173 logging.info("Indexing popcon submissions from \'%s\'" % 183 logging.info("Indexing popcon submissions from \'%s\'" %
174 - self.popcon_dir) 184 + self.source_dir)
175 logging.info("Creating new xapian index at \'%s\'" % 185 logging.info("Creating new xapian index at \'%s\'" %
176 self.path) 186 self.path)
177 xapian.WritableDatabase.__init__(self,self.path, 187 xapian.WritableDatabase.__init__(self,self.path,
@@ -180,123 +190,79 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton): @@ -180,123 +190,79 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton):
180 logging.critical("Could not create popcon xapian index.") 190 logging.critical("Could not create popcon xapian index.")
181 raise Error 191 raise Error
182 192
183 - for root, dirs, files in os.walk(self.popcon_dir):  
184 - for submission in files:  
185 - submission_path = os.path.join(root, submission) 193 + for root, dirs, files in os.walk(self.source_dir):
  194 + for popcon_file in files:
  195 + submission = PopconSubmission(os.path.join(root, popcon_file))
186 doc = xapian.Document() 196 doc = xapian.Document()
187 - doc.set_data(submission)  
188 - logging.debug("Parsing popcon submission at \'%s\'" %  
189 - submission_path)  
190 - for pkg, freq in self.parse_submission(submission_path): 197 + doc.set_data(submission.user_id)
  198 + logging.debug("Parsing popcon submission \'%s\'" %
  199 + submission.user_id)
  200 + for pkg, freq in submission.packages.items():
191 doc.add_term("XP"+pkg,freq) 201 doc.add_term("XP"+pkg,freq)
192 for tag in axi_search_pkg_tags(self.axi,pkg): 202 for tag in axi_search_pkg_tags(self.axi,pkg):
193 - print tag  
194 doc.add_term(tag,freq) 203 doc.add_term(tag,freq)
195 doc_id = self.add_document(doc) 204 doc_id = self.add_document(doc)
196 logging.debug("Popcon Xapian: Indexing doc %d" % doc_id) 205 logging.debug("Popcon Xapian: Indexing doc %d" % doc_id)
197 # python garbage collector 206 # python garbage collector
198 gc.collect() 207 gc.collect()
199 # flush to disk database changes 208 # flush to disk database changes
200 - self.flush() 209 + self.commit()
201 210
202 -class PopconClusteredData(Singleton):  
203 - """  
204 - Data source for popcon submissions defined as a singleton xapian database.  
205 - """  
206 - def __init__(self,cfg): 211 + def get_submissions(self,submissions_dir):
207 """ 212 """
208 - Set initial attributes. 213 + Get popcon submissions from popcon_dir
209 """ 214 """
210 - self.popcon_dir = os.path.expanduser(cfg.popcon_dir)  
211 - self.clusters_dir = os.path.expanduser(cfg.clusters_dir)  
212 - self.submissions = []  
213 - self.clustering() 215 + submissions = []
  216 + for root, dirs, files in os.walk(submissions_dir):
  217 + for popcon_file in files:
  218 + submission = PopconSubmission(os.path.join(root, popcon_file))
  219 + submissions.append(submission)
  220 + return submissions
214 221
215 - def parse_submission(self,submission_path,binary=1):  
216 - """  
217 - Parse a popcon submission, generating the names of the valid packages  
218 - in the vote.  
219 - """  
220 - submission_file = open(submission_path)  
221 - for line in submission_file:  
222 - if not line.startswith("POPULARITY"):  
223 - if not line.startswith("END-POPULARITY"):  
224 - data = line[:-1].split(" ")  
225 - if len(data) > 3:  
226 - if binary:  
227 - # every installed package has the same weight  
228 - yield data[2], 1  
229 - elif data[3] == '<NOFILES>':  
230 - # No executable files to track  
231 - yield data[2], 1  
232 - elif len(data) == 4:  
233 - # Recently used packages  
234 - yield data[2], 10  
235 - elif data[4] == '<OLD>':  
236 - # Unused packages  
237 - yield data[2], 3  
238 - elif data[4] == '<RECENT-CTIME>':  
239 - # Recently installed packages  
240 - yield data[2], 8  
241 -  
242 - def clustering(self): 222 + def hierarchical_clustering(self,data,clusters_dir,distance,k=10):
243 """ 223 """
244 - called by init  
245 - Create a xapian index for popcon submissions at 'popcon_dir' and  
246 - place it at 'self.path'. 224 + Select popcon submissions from popcon_dir and place them at clusters_dir
247 """ 225 """
248 - if not os.path.exists(self.clusters_dir):  
249 - os.makedirs(self.clusters_dir)  
250 -  
251 - logging.info("Clustering popcon submissions from \'%s\'" %  
252 - self.popcon_dir)  
253 - logging.info("Clusters will be placed at \'%s\'" % self.clusters_dir) 226 + cl = cluster.HierarchicalClustering(data, lambda x,y:
  227 + distance(x.packages.keys(),
  228 + y.packages.keys()))
  229 + clusters = cl.getlevel(0.5)
  230 + for c in clusters:
  231 + print "cluster"
  232 + for submission in c:
  233 + print submission.user_id
254 234
255 - for root, dirs, files in os.walk(self.popcon_dir):  
256 - for submission_hash in files:  
257 - s = PopconSubmission(submission_hash)  
258 - submission_path = os.path.join(root, submission_hash)  
259 - logging.debug("Parsing popcon submission \'%s\'" %  
260 - submission_hash)  
261 - for pkg, freq in self.parse_submission(submission_path):  
262 - s.add_pkg(pkg)  
263 - self.submissions.append(s)  
264 -  
265 - distanceFunction = JaccardDistance()  
266 - # cl = cluster.HierarchicalClustering(self.submissions,lambda x,y: distanceFunction(x.pkgs_list,y.pkgs_list))  
267 - # clusters = cl.getlevel(0.5)  
268 - # for c in clusters:  
269 - # print "cluster"  
270 - # for submission in c:  
271 - # print submission.hash  
272 - cl = KMedoidsClusteringPopcon(self.submissions, lambda x,y: \  
273 - distanceFunction(x.pkgs_list,y.pkgs_list))  
274 - #clusters = cl.getclusters(2)  
275 - medoids = cl.getMedoids(2)  
276 - print "medoids"  
277 - for m in medoids:  
278 - print m.hash 235 + def kmedoids_clustering(self,data,clusters_dir,distance,k=10):
  236 + clusters = KMedoidsClustering(data,lambda x,y:
  237 + distance(x.packages.keys(),
  238 + y.packages.keys()))
  239 + medoids = clusters.getMedoids(2)
  240 + for submission in medoids:
  241 + shutil.copyfile(submission.path,os.path.join(clusters_dir,
  242 + submission.user_id))
279 243
280 -class KMedoidsClusteringPopcon(cluster.KMeansClustering): 244 +class KMedoidsClustering(cluster.KMeansClustering):
281 245
282 def __init__(self,data,distance): 246 def __init__(self,data,distance):
283 - if len(data)>100: 247 + if len(data)<100:
  248 + data_sample = data
  249 + else:
284 data_sample = random.sample(data,100) 250 data_sample = random.sample(data,100)
285 cluster.KMeansClustering.__init__(self, data_sample, distance) 251 cluster.KMeansClustering.__init__(self, data_sample, distance)
286 self.distanceMatrix = {} 252 self.distanceMatrix = {}
287 for submission in self._KMeansClustering__data: 253 for submission in self._KMeansClustering__data:
288 - self.distanceMatrix[submission.hash] = {} 254 + self.distanceMatrix[submission.user_id] = {}
289 255
290 def loadDistanceMatrix(self,cluster): 256 def loadDistanceMatrix(self,cluster):
291 for i in range(len(cluster)-1): 257 for i in range(len(cluster)-1):
292 for j in range(i+1,len(cluster)): 258 for j in range(i+1,len(cluster)):
293 try: 259 try:
294 - d = self.distanceMatrix[cluster[i].hash][cluster[j].hash] 260 + d = self.distanceMatrix[cluster[i].user_id][cluster[j].user_id]
295 logging.debug("Using d[%d,%d]" % (i,j)) 261 logging.debug("Using d[%d,%d]" % (i,j))
296 except: 262 except:
297 d = self.distance(cluster[i],cluster[j]) 263 d = self.distance(cluster[i],cluster[j])
298 - self.distanceMatrix[cluster[i].hash][cluster[j].hash] = d  
299 - self.distanceMatrix[cluster[j].hash][cluster[i].hash] = d 264 + self.distanceMatrix[cluster[i].user_id][cluster[j].user_id] = d
  265 + self.distanceMatrix[cluster[j].user_id][cluster[i].user_id] = d
300 logging.debug("d[%d,%d] = %.2f" % (i,j,d)) 266 logging.debug("d[%d,%d] = %.2f" % (i,j,d))
301 267
302 def getMedoid(self,cluster): 268 def getMedoid(self,cluster):
@@ -308,22 +274,19 @@ class KMedoidsClusteringPopcon(cluster.KMeansClustering): @@ -308,22 +274,19 @@ class KMedoidsClusteringPopcon(cluster.KMeansClustering):
308 self.loadDistanceMatrix(cluster) 274 self.loadDistanceMatrix(cluster)
309 medoidDistance = sys.maxint 275 medoidDistance = sys.maxint
310 for i in range(len(cluster)): 276 for i in range(len(cluster)):
311 - totalDistance = sum(self.distanceMatrix[cluster[i].hash].values()) 277 + totalDistance = sum(self.distanceMatrix[cluster[i].user_id].values())
312 print "totalDistance[",i,"]=",totalDistance 278 print "totalDistance[",i,"]=",totalDistance
313 if totalDistance < medoidDistance: 279 if totalDistance < medoidDistance:
314 medoidDistance = totalDistance 280 medoidDistance = totalDistance
315 medoid = i 281 medoid = i
316 print "medoidDistance:",medoidDistance 282 print "medoidDistance:",medoidDistance
317 - logging.debug("Cluster medoid: [%d] %s" % (medoid, cluster[medoid].hash)) 283 + logging.debug("Cluster medoid: [%d] %s" % (medoid,
  284 + cluster[medoid].user_id))
318 return cluster[medoid] 285 return cluster[medoid]
319 286
320 def assign_item(self, item, origin): 287 def assign_item(self, item, origin):
321 """ 288 """
322 Assigns an item from a given cluster to the closest located cluster 289 Assigns an item from a given cluster to the closest located cluster
323 -  
324 - PARAMETERS  
325 - item - the item to be moved  
326 - origin - the originating cluster  
327 """ 290 """
328 closest_cluster = origin 291 closest_cluster = origin
329 for cluster in self._KMeansClustering__clusters: 292 for cluster in self._KMeansClustering__clusters:
@@ -332,7 +295,7 @@ class KMedoidsClusteringPopcon(cluster.KMeansClustering): @@ -332,7 +295,7 @@ class KMedoidsClusteringPopcon(cluster.KMeansClustering):
332 295
333 if closest_cluster != origin: 296 if closest_cluster != origin:
334 self.move_item(item, origin, closest_cluster) 297 self.move_item(item, origin, closest_cluster)
335 - logging.debug("Item changed cluster: %s" % item.hash) 298 + logging.debug("Item changed cluster: %s" % item.user_id)
336 return True 299 return True
337 else: 300 else:
338 return False 301 return False
@@ -342,5 +305,5 @@ class KMedoidsClusteringPopcon(cluster.KMeansClustering): @@ -342,5 +305,5 @@ class KMedoidsClusteringPopcon(cluster.KMeansClustering):
342 Generate n clusters and return their medoids. 305 Generate n clusters and return their medoids.
343 """ 306 """
344 medoids = [self.getMedoid(cluster) for cluster in self.getclusters(n)] 307 medoids = [self.getMedoid(cluster) for cluster in self.getclusters(n)]
345 - logging.info("Clustering completed and the following centroids were found: %s" % [c.hash for c in medoids]) 308 + logging.info("Clustering completed and the following centroids were found: %s" % [c.user_id for c in medoids])
346 return medoids 309 return medoids