Commit 353b42add083cccd2d18764ef0ec6e6b6b1878b6

Authored by Tássia Camões Araújo
1 parent 37e376c1
Exists in master and in 1 other branch: add_vagrant

Complete refactoring of the data classes.

Showing 1 changed file with 134 additions and 171 deletions   Show diff stats
src/data.py
... ... @@ -22,17 +22,14 @@ __license__ = """
22 22 import os
23 23 import sys
24 24 import gc
25   -import re
26 25 import xapian
27   -import axi
28   -from debian import debtags
29 26 import logging
30   -import hashlib
31 27 import random
  28 +import cluster
  29 +import shutil
32 30  
33 31 from error import Error
34 32 from singleton import Singleton
35   -import cluster
36 33 from dissimilarity import *
37 34  
38 35 def axi_search_pkgs(axi,pkgs_list):
... ... @@ -53,101 +50,114 @@ def axi_search_pkg_tags(axi,pkg):
53 50 term.term.startswith("XT")]
54 51 return tags
55 52  
  53 +def print_index(index):
  54 + output = "\n---\n" + xapian.Database.__repr__(index) + "\n---\n"
  55 + for term in index.allterms():
  56 + output += term.term+"\n"
  57 + output += str([index.get_document(posting.docid).get_data()
  58 + for posting in index.postlist(term.term)])
  59 + output += "\n---"
  60 + return output
  61 +
56 62 class SampleAptXapianIndex(xapian.WritableDatabase):
57 63 """
58 64 Sample data source for packages information, mainly useful for tests.
59 65 """
60   - def __init__(self,pkgs_list,axi):
61   - xapian.WritableDatabase.__init__(self,".sample_axi",
  66 + def __init__(self,pkgs_list,axi,path):
  67 + xapian.WritableDatabase.__init__(self,path,
62 68 xapian.DB_CREATE_OR_OVERWRITE)
63 69 sample = axi_search_pkgs(axi,pkgs_list)
64   - self.all_docs = []
65 70 for package in sample:
66 71 doc_id = self.add_document(axi.get_document(package.docid))
67   - self.all_docs.append(doc_id)
68 72  
69   - def _print(self):
70   - print "---"
71   - print xapian.WritableDatabase.__repr__(self)
72   - print "---"
73   - for doc_id in self.all_docs:
74   - print [term.term for term in self.get_document(doc_id).termlist()]
75   - print "---"
  73 + def __str__(self):
  74 + return print_index(self)
76 75  
77 76 class PopconSubmission():
78   - def __init__(self,submission_hash):
79   - self.hash = submission_hash
80   - self.pkgs_list = []
  77 + def __init__(self,path,user_id=0):
  78 + self.packages = dict()
  79 + self.path = path
  80 + self.load()
  81 + if user_id:
  82 + self.user_id = user_id
81 83  
82   - def add_pkg(self,pkg):
83   - self.pkgs_list.append(pkg)
  84 + def __str__(self):
  85 + output = "\nPopularity-contest submission ID "+self.user_id
  86 + for pkg, weight in self.packages.items():
  87 + output += "\n "+pkg+": "+str(weight)
  88 + return output
84 89  
85   - def parse_submission(self,submission_path,binary=1):
  90 + def load(self,binary=1):
86 91 """
87 92 Parse a popcon submission, generating the names of the valid packages
88 93 in the vote.
89 94 """
90   - submission = open(submission_path)
91   - for line in submission:
92   - if not line.startswith("POPULARITY"):
93   - if not line.startswith("END-POPULARITY"):
94   - data = line[:-1].split(" ")
95   - if len(data) > 3:
96   - if binary:
97   - # every installed package has the same weight
98   - yield data[2], 1
99   - elif data[3] == '<NOFILES>':
  95 + with open(self.path) as submission:
  96 + for line in submission:
  97 + if line.startswith("POPULARITY"):
  98 + self.user_id = line.split()[2].lstrip("ID:")
  99 + elif not line.startswith("END-POPULARITY"):
  100 + data = line.rstrip('\n').split()
  101 + if len(data) > 2:
  102 + pkg = data[2]
  103 + if len(data) > 3:
  104 + exec_file = data[3]
  105 + # Binary weight
  106 + if binary:
  107 + self.packages[pkg] = 1
  108 + # Weights inherited from Enrico's anapop
100 109 # No executable files to track
101   - yield data[2], 1
102   - elif len(data) == 4:
  110 + elif exec_file == '<NOFILES>':
  111 + self.packages[pkg] = 1
103 112 # Recently used packages
104   - yield data[2], 10
105   - elif data[4] == '<OLD>':
  113 + elif len(data) == 4:
  114 + self.packages[pkg] = 10
106 115 # Unused packages
107   - yield data[2], 3
108   - elif data[4] == '<RECENT-CTIME>':
  116 + elif data[4] == '<OLD>':
  117 + self.packages[pkg] = 3
109 118 # Recently installed packages
110   - yield data[2], 8
111   -class PopconXapianIndex(xapian.WritableDatabase,Singleton):
  119 + elif data[4] == '<RECENT-CTIME>':
  120 + self.packages[pkg] = 8
  121 +
  122 +class PopconXapianIndex(xapian.WritableDatabase):
112 123 """
113 124 Data source for popcon submissions defined as a singleton xapian database.
114 125 """
115   - def __init__(self,cfg):
  126 + def __init__(self,cfg,reindex=0,recluster=0):
116 127 """
117 128 Set initial attributes.
118 129 """
119   - self.path = os.path.expanduser(cfg.popcon_index)
120   - self.popcon_dir = os.path.expanduser(cfg.popcon_dir)
121   - #self.debtags_path = os.path.expanduser(cfg.tags_db)
122 130 self.axi = xapian.Database(cfg.axi)
123   - self.load_index()
  131 + self.path = os.path.expanduser(cfg.popcon_index)
  132 + if reindex or not self.load_index():
  133 + if not os.path.exists(cfg.popcon_dir):
  134 + os.makedirs(cfg.popcon_dir)
  135 + if not os.listdir(cfg.popcon_dir):
  136 + logging.critical("Popcon dir seems to be empty.")
  137 + raise Error
  138 + if not cfg.clustering:
  139 + self.source_dir = os.path.expanduser(cfg.popcon_dir)
  140 + else:
  141 + self.source_dir = os.path.expanduser(cfg.clusters_dir)
  142 + if not os.path.exists(cfg.clusters_dir):
  143 + os.makedirs(cfg.clusters_dir)
  144 + if not os.listdir(cfg.clusters_dir):
  145 + distance = JaccardDistance()
  146 + logging.info("Clustering popcon submissions from \'%s\'"
  147 + % cfg.popcon_dir)
  148 + logging.info("Clusters will be placed at \'%s\'"
  149 + % cfg.clusters_dir)
  150 + data = self.get_submissions(cfg.popcon_dir)
  151 + if cfg.clustering == "Hierarchical":
  152 + self.hierarchical_clustering(data,cfg.clusters_dir,
  153 + distance)
  154 + else:
  155 + self.kmedoids_clustering(data,cfg.clusters_dir,
  156 + distance)
  157 + self.build_index()
124 158  
125   - def parse_submission(self,submission_path,binary=1):
126   - """
127   - Parse a popcon submission, generating the names of the valid packages
128   - in the vote.
129   - """
130   - submission = open(submission_path)
131   - for line in submission:
132   - if not line.startswith("POPULARITY"):
133   - if not line.startswith("END-POPULARITY"):
134   - data = line[:-1].split(" ")
135   - if len(data) > 3:
136   - if binary:
137   - # every installed package has the same weight
138   - yield data[2], 1
139   - elif data[3] == '<NOFILES>':
140   - # No executable files to track
141   - yield data[2], 1
142   - elif len(data) == 4:
143   - # Recently used packages
144   - yield data[2], 10
145   - elif data[4] == '<OLD>':
146   - # Unused packages
147   - yield data[2], 3
148   - elif data[4] == '<RECENT-CTIME>':
149   - # Recently installed packages
150   - yield data[2], 8
  159 + def __str__(self):
  160 + return print_index(self)
151 161  
152 162 def load_index(self):
153 163 """
... ... @@ -159,19 +169,19 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton):
159 169 xapian.Database.__init__(self,self.path)
160 170 except xapian.DatabaseError:
161 171 logging.info("Could not open popcon index.")
162   - self.new_index()
  172 + return 0
163 173  
164   - def new_index(self):
  174 + def build_index(self):
165 175 """
166   - Create a xapian index for popcon submissions at 'popcon_dir' and
  176 + Create a xapian index for popcon submissions at 'source_dir' and
167 177 place it at 'self.path'.
168 178 """
169   - if not os.path.exists(self.path):
170   - os.makedirs(self.path)
  179 + shutil.rmtree(self.path,1)
  180 + os.makedirs(self.path)
171 181  
172 182 try:
173 183 logging.info("Indexing popcon submissions from \'%s\'" %
174   - self.popcon_dir)
  184 + self.source_dir)
175 185 logging.info("Creating new xapian index at \'%s\'" %
176 186 self.path)
177 187 xapian.WritableDatabase.__init__(self,self.path,
... ... @@ -180,123 +190,79 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton):
180 190 logging.critical("Could not create popcon xapian index.")
181 191 raise Error
182 192  
183   - for root, dirs, files in os.walk(self.popcon_dir):
184   - for submission in files:
185   - submission_path = os.path.join(root, submission)
  193 + for root, dirs, files in os.walk(self.source_dir):
  194 + for popcon_file in files:
  195 + submission = PopconSubmission(os.path.join(root, popcon_file))
186 196 doc = xapian.Document()
187   - doc.set_data(submission)
188   - logging.debug("Parsing popcon submission at \'%s\'" %
189   - submission_path)
190   - for pkg, freq in self.parse_submission(submission_path):
  197 + doc.set_data(submission.user_id)
  198 + logging.debug("Parsing popcon submission \'%s\'" %
  199 + submission.user_id)
  200 + for pkg, freq in submission.packages.items():
191 201 doc.add_term("XP"+pkg,freq)
192 202 for tag in axi_search_pkg_tags(self.axi,pkg):
193   - print tag
194 203 doc.add_term(tag,freq)
195 204 doc_id = self.add_document(doc)
196 205 logging.debug("Popcon Xapian: Indexing doc %d" % doc_id)
197 206 # python garbage collector
198 207 gc.collect()
199 208 # flush to disk database changes
200   - self.flush()
  209 + self.commit()
201 210  
202   -class PopconClusteredData(Singleton):
203   - """
204   - Data source for popcon submissions defined as a singleton xapian database.
205   - """
206   - def __init__(self,cfg):
  211 + def get_submissions(self,submissions_dir):
207 212 """
208   - Set initial attributes.
  213 + Get popcon submissions from popcon_dir
209 214 """
210   - self.popcon_dir = os.path.expanduser(cfg.popcon_dir)
211   - self.clusters_dir = os.path.expanduser(cfg.clusters_dir)
212   - self.submissions = []
213   - self.clustering()
  215 + submissions = []
  216 + for root, dirs, files in os.walk(submissions_dir):
  217 + for popcon_file in files:
  218 + submission = PopconSubmission(os.path.join(root, popcon_file))
  219 + submissions.append(submission)
  220 + return submissions
214 221  
215   - def parse_submission(self,submission_path,binary=1):
216   - """
217   - Parse a popcon submission, generating the names of the valid packages
218   - in the vote.
219   - """
220   - submission_file = open(submission_path)
221   - for line in submission_file:
222   - if not line.startswith("POPULARITY"):
223   - if not line.startswith("END-POPULARITY"):
224   - data = line[:-1].split(" ")
225   - if len(data) > 3:
226   - if binary:
227   - # every installed package has the same weight
228   - yield data[2], 1
229   - elif data[3] == '<NOFILES>':
230   - # No executable files to track
231   - yield data[2], 1
232   - elif len(data) == 4:
233   - # Recently used packages
234   - yield data[2], 10
235   - elif data[4] == '<OLD>':
236   - # Unused packages
237   - yield data[2], 3
238   - elif data[4] == '<RECENT-CTIME>':
239   - # Recently installed packages
240   - yield data[2], 8
241   -
242   - def clustering(self):
  222 + def hierarchical_clustering(self,data,clusters_dir,distance,k=10):
243 223 """
244   - called by init
245   - Create a xapian index for popcon submissions at 'popcon_dir' and
246   - place it at 'self.path'.
  224 + Select popcon submissions from popcon_dir and place them at clusters_dir
247 225 """
248   - if not os.path.exists(self.clusters_dir):
249   - os.makedirs(self.clusters_dir)
250   -
251   - logging.info("Clustering popcon submissions from \'%s\'" %
252   - self.popcon_dir)
253   - logging.info("Clusters will be placed at \'%s\'" % self.clusters_dir)
  226 + cl = cluster.HierarchicalClustering(data, lambda x,y:
  227 + distance(x.packages.keys(),
  228 + y.packages.keys()))
  229 + clusters = cl.getlevel(0.5)
  230 + for c in clusters:
  231 + print "cluster"
  232 + for submission in c:
  233 + print submission.user_id
254 234  
255   - for root, dirs, files in os.walk(self.popcon_dir):
256   - for submission_hash in files:
257   - s = PopconSubmission(submission_hash)
258   - submission_path = os.path.join(root, submission_hash)
259   - logging.debug("Parsing popcon submission \'%s\'" %
260   - submission_hash)
261   - for pkg, freq in self.parse_submission(submission_path):
262   - s.add_pkg(pkg)
263   - self.submissions.append(s)
264   -
265   - distanceFunction = JaccardDistance()
266   - # cl = cluster.HierarchicalClustering(self.submissions,lambda x,y: distanceFunction(x.pkgs_list,y.pkgs_list))
267   - # clusters = cl.getlevel(0.5)
268   - # for c in clusters:
269   - # print "cluster"
270   - # for submission in c:
271   - # print submission.hash
272   - cl = KMedoidsClusteringPopcon(self.submissions, lambda x,y: \
273   - distanceFunction(x.pkgs_list,y.pkgs_list))
274   - #clusters = cl.getclusters(2)
275   - medoids = cl.getMedoids(2)
276   - print "medoids"
277   - for m in medoids:
278   - print m.hash
  235 + def kmedoids_clustering(self,data,clusters_dir,distance,k=10):
  236 + clusters = KMedoidsClustering(data,lambda x,y:
  237 + distance(x.packages.keys(),
  238 + y.packages.keys()))
  239 + medoids = clusters.getMedoids(2)
  240 + for submission in medoids:
  241 + shutil.copyfile(submission.path,os.path.join(clusters_dir,
  242 + submission.user_id))
279 243  
280   -class KMedoidsClusteringPopcon(cluster.KMeansClustering):
  244 +class KMedoidsClustering(cluster.KMeansClustering):
281 245  
282 246 def __init__(self,data,distance):
283   - if len(data)>100:
  247 + if len(data)<100:
  248 + data_sample = data
  249 + else:
284 250 data_sample = random.sample(data,100)
285 251 cluster.KMeansClustering.__init__(self, data_sample, distance)
286 252 self.distanceMatrix = {}
287 253 for submission in self._KMeansClustering__data:
288   - self.distanceMatrix[submission.hash] = {}
  254 + self.distanceMatrix[submission.user_id] = {}
289 255  
290 256 def loadDistanceMatrix(self,cluster):
291 257 for i in range(len(cluster)-1):
292 258 for j in range(i+1,len(cluster)):
293 259 try:
294   - d = self.distanceMatrix[cluster[i].hash][cluster[j].hash]
  260 + d = self.distanceMatrix[cluster[i].user_id][cluster[j].user_id]
295 261 logging.debug("Using d[%d,%d]" % (i,j))
296 262 except:
297 263 d = self.distance(cluster[i],cluster[j])
298   - self.distanceMatrix[cluster[i].hash][cluster[j].hash] = d
299   - self.distanceMatrix[cluster[j].hash][cluster[i].hash] = d
  264 + self.distanceMatrix[cluster[i].user_id][cluster[j].user_id] = d
  265 + self.distanceMatrix[cluster[j].user_id][cluster[i].user_id] = d
300 266 logging.debug("d[%d,%d] = %.2f" % (i,j,d))
301 267  
302 268 def getMedoid(self,cluster):
... ... @@ -308,22 +274,19 @@ class KMedoidsClusteringPopcon(cluster.KMeansClustering):
308 274 self.loadDistanceMatrix(cluster)
309 275 medoidDistance = sys.maxint
310 276 for i in range(len(cluster)):
311   - totalDistance = sum(self.distanceMatrix[cluster[i].hash].values())
  277 + totalDistance = sum(self.distanceMatrix[cluster[i].user_id].values())
312 278 print "totalDistance[",i,"]=",totalDistance
313 279 if totalDistance < medoidDistance:
314 280 medoidDistance = totalDistance
315 281 medoid = i
316 282 print "medoidDistance:",medoidDistance
317   - logging.debug("Cluster medoid: [%d] %s" % (medoid, cluster[medoid].hash))
  283 + logging.debug("Cluster medoid: [%d] %s" % (medoid,
  284 + cluster[medoid].user_id))
318 285 return cluster[medoid]
319 286  
320 287 def assign_item(self, item, origin):
321 288 """
322 289 Assigns an item from a given cluster to the closest located cluster
323   -
324   - PARAMETERS
325   - item - the item to be moved
326   - origin - the originating cluster
327 290 """
328 291 closest_cluster = origin
329 292 for cluster in self._KMeansClustering__clusters:
... ... @@ -332,7 +295,7 @@ class KMedoidsClusteringPopcon(cluster.KMeansClustering):
332 295  
333 296 if closest_cluster != origin:
334 297 self.move_item(item, origin, closest_cluster)
335   - logging.debug("Item changed cluster: %s" % item.hash)
  298 + logging.debug("Item changed cluster: %s" % item.user_id)
336 299 return True
337 300 else:
338 301 return False
... ... @@ -342,5 +305,5 @@ class KMedoidsClusteringPopcon(cluster.KMeansClustering):
342 305 Generate n clusters and return their medoids.
343 306 """
344 307 medoids = [self.getMedoid(cluster) for cluster in self.getclusters(n)]
345   - logging.info("Clustering completed and the following centroids were found: %s" % [c.hash for c in medoids])
  308 + logging.info("Clustering completed and the following centroids were found: %s" % [c.user_id for c in medoids])
346 309 return medoids
... ...