Commit 98f794f3f6a75df5dfac85aa94962f90e9c895d8
1 parent
6c99e7cd
Exists in
master
and in
1 other branch
Clustering initial implementation.
Showing
2 changed files
with
197 additions
and
0 deletions
Show diff stats
... | ... | @@ -0,0 +1,46 @@ |
1 | +#!/usr/bin/python | |
2 | + | |
3 | +# Clustering - a python script to perform clustering of popcon data. | |
4 | +# | |
5 | +# Copyright (C) 2010 Tassia Camoes <tassia@gmail.com> | |
6 | +# | |
7 | +# This program is free software: you can redistribute it and/or modify | |
8 | +# it under the terms of the GNU General Public License as published by | |
9 | +# the Free Software Foundation, either version 3 of the License, or | |
10 | +# (at your option) any later version. | |
11 | +# | |
12 | +# This program is distributed in the hope that it will be useful, | |
13 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | +# GNU General Public License for more details. | |
16 | +# | |
17 | +# You should have received a copy of the GNU General Public License | |
18 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
19 | + | |
20 | +import os | |
21 | +import sys | |
22 | +import logging | |
23 | +import datetime | |
24 | +from datetime import timedelta | |
25 | + | |
26 | +from config import * | |
27 | +from data import * | |
28 | +from similarity import * | |
29 | +from error import Error | |
30 | + | |
def elapsed_seconds(delta):
    """
    Return the whole number of seconds spanned by the timedelta 'delta'.

    'delta.seconds' alone only holds the remainder below one day, so the
    'days' component has to be added explicitly; otherwise a run longer than
    24 hours would be reported as much shorter than it really was.
    """
    return delta.days * 86400 + delta.seconds


if __name__ == '__main__':
    # Script entry point: build the configuration, run the clustering (which
    # is triggered by constructing PopconClusteredData) and log how long the
    # whole computation took.
    try:
        cfg = Config()
        begin_time = datetime.datetime.now()
        logging.info("Clustering computation started at %s" % begin_time)

        cl = PopconClusteredData(cfg)

        end_time = datetime.datetime.now()
        logging.info("Clustering computation completed at %s" % end_time)
        delta = end_time - begin_time
        logging.info("Time elapsed: %d seconds." % elapsed_seconds(delta))

    except Error:
        # Project-level errors are reported here instead of surfacing as a
        # traceback.
        logging.critical("Aborting proccess. Use '--debug' for more details.")
46 | + | ... | ... |
src/data.py
... | ... | @@ -29,6 +29,8 @@ import hashlib |
29 | 29 | |
30 | 30 | from error import Error |
31 | 31 | from singleton import Singleton |
32 | +import cluster | |
33 | +from similarity import * | |
32 | 34 | |
33 | 35 | class Item: |
34 | 36 | """ |
... | ... | @@ -258,3 +260,152 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton): |
258 | 260 | gc.collect() |
259 | 261 | # flush to disk database changes |
260 | 262 | self.flush() |
263 | + | |
class PopconSubmission:
    """
    A single popcon submission: its identifying hash plus the names of the
    packages collected for it, in insertion order.
    """

    def __init__(self, submission_hash):
        """
        Remember the submission hash and start with no packages.
        """
        self.pkgs_list = list()
        self.hash = submission_hash

    def add_pkg(self, pkg):
        """
        Append 'pkg' to the end of this submission's package list.
        """
        self.pkgs_list.extend((pkg,))
271 | + | |
class PopconClusteredData(Singleton):
    """
    Singleton that reads popcon submissions from 'popcon_dir' and groups them
    into clusters according to the packages each submission lists.
    """
    def __init__(self,cfg):
        """
        Set initial attributes and run the clustering right away.

        'cfg' must provide the 'popcon_dir' and 'clusters_dir' attributes;
        a leading '~' in either path is expanded.
        """
        self.popcon_dir = os.path.expanduser(cfg.popcon_dir)
        self.clusters_dir = os.path.expanduser(cfg.clusters_dir)
        self.submissions = []
        self.clustering()

    def parse_submission(self,submission_path,binary=1):
        """
        Parse a popcon submission, generating (package name, weight) pairs
        for the valid packages in the vote.

        Lines starting with "POPULARITY" or "END-POPULARITY" and lines with
        fewer than four space-separated fields are skipped. The third field
        is taken as the package name.

        If 'binary' is true every package gets weight 1. Otherwise the
        weight depends on the tag fields of the line, and lines whose tag
        is not recognized are skipped.
        """
        # The 'with' block closes the file once the generator is exhausted
        # or discarded, instead of leaking the handle.
        with open(submission_path) as submission_file:
            for line in submission_file:
                if line.startswith(("POPULARITY", "END-POPULARITY")):
                    continue
                # Strip only the line terminator: slicing off the last
                # character would corrupt the final field of a file that
                # does not end with a newline.
                data = line.rstrip("\n").split(" ")
                if len(data) <= 3:
                    continue
                if binary:
                    # every installed package has the same weight
                    yield data[2], 1
                elif data[3] == '<NOFILES>':
                    # No executable files to track
                    yield data[2], 1
                elif len(data) == 4:
                    # Recently used packages
                    yield data[2], 10
                elif data[4] == '<OLD>':
                    # Unused packages
                    yield data[2], 3
                elif data[4] == '<RECENT-CTIME>':
                    # Recently installed packages
                    yield data[2], 8

    def clustering(self):
        """
        Called by __init__.
        Load every popcon submission found under 'popcon_dir', cluster the
        submissions by the distance between their package lists and print
        the hashes of the members of each resulting cluster.

        'clusters_dir' is created if it does not exist yet; nothing is
        written into it by this method.
        """
        if not os.path.exists(self.clusters_dir):
            os.makedirs(self.clusters_dir)

        logging.info("Clustering popcon submissions from \'%s\'",
                     self.popcon_dir)
        logging.info("Clusters will be placed at \'%s\'", self.clusters_dir)

        for root, _dirs, files in os.walk(self.popcon_dir):
            for submission_hash in files:
                # The file name is used as the submission identifier.
                s = PopconSubmission(submission_hash)
                submission_path = os.path.join(root, submission_hash)
                logging.debug("Parsing popcon submission \'%s\'",
                              submission_hash)
                # Only package names matter here; the weight is ignored.
                for pkg, _freq in self.parse_submission(submission_path):
                    s.add_pkg(pkg)
                self.submissions.append(s)

        # NOTE(review): a Jaccard index is conventionally a similarity
        # (larger means closer) while it is passed here as a distance;
        # confirm what similarity.JaccardIndex actually returns.
        distanceFunction = JaccardIndex()
        cl = cluster.HierarchicalClustering(
            self.submissions,
            lambda x,y: distanceFunction(x.pkgs_list,y.pkgs_list))
        clusters = cl.getlevel(0.5)
        for c in clusters:
            # Single-argument parenthesized print emits the same text on
            # Python 2 and Python 3.
            print("cluster")
            for submission in c:
                print(submission.hash)
        # Alternative based on k-medoids, kept for reference:
        #cl = KMedoidsClusteringPopcon(self.submissions,
        #      lambda x,y: distanceFunction(x.pkgs_list,y.pkgs_list))
        #clusters = cl.getclusters(2)
        #medoids = cl.getMedoids(2)
346 | + | |
class KMedoidsClusteringPopcon(cluster.KMeansClustering):
    """
    Variant of cluster.KMeansClustering for popcon submissions in which each
    cluster is represented by its medoid (the member with the smallest total
    distance to the other members).

    Pairwise distances are cached in 'distanceMatrix', a dict of dicts keyed
    by submission hash. The class reads the name-mangled private attributes
    of the base class ('_KMeansClustering__data' and
    '_KMeansClustering__clusters'), so it depends on that implementation.
    """

    def __init__(self,data,distance):
        """
        Initialize the base class and create one empty cache row per
        submission in 'data'.
        """
        cluster.KMeansClustering.__init__(self, data, distance)
        self.distanceMatrix = {}
        for submission in self._KMeansClustering__data:
            self.distanceMatrix[submission.hash] = {}

    def loadDistanceMatrix(self,cluster):
        """
        Make sure the distance between every pair of members of 'cluster' is
        present in the cache, computing the missing ones.
        """
        for i in range(len(cluster)-1):
            for j in range(i+1,len(cluster)):
                first = cluster[i].hash
                second = cluster[j].hash
                try:
                    d = self.distanceMatrix[first][second]
                except KeyError:
                    # Only a cache miss is expected here; any other error
                    # must not be silently swallowed.
                    d = self.distance(cluster[i],cluster[j])
                    # setdefault also covers submissions that were not part
                    # of the data the matrix was initialized with.
                    self.distanceMatrix.setdefault(first, {})[second] = d
                    self.distanceMatrix.setdefault(second, {})[first] = d
                    logging.debug("d[%d,%d] = %.2f" % (i,j,d))
                else:
                    logging.debug("Using d[%d,%d]" % (i,j))

    def getMedoid(self,cluster):
        """
        Return the medoid popcon submission of a given cluster, based on
        the distance function.

        Raises ValueError if 'cluster' is empty.
        """
        if not cluster:
            raise ValueError("Cannot compute the medoid of an empty cluster.")
        logging.debug("Cluster size: %d" % len(cluster))
        self.loadDistanceMatrix(cluster)
        medoid = 0
        medoidDistance = float("inf")
        for i, candidate in enumerate(cluster):
            row = self.distanceMatrix[candidate.hash]
            # Sum only the distances to the other members of this cluster;
            # the cache row may also hold distances to submissions that
            # belong to other clusters.
            totalDistance = sum(row[other.hash]
                                for j, other in enumerate(cluster) if j != i)
            logging.debug("totalDistance[%d] = %s" % (i, totalDistance))
            if totalDistance < medoidDistance:
                medoidDistance = totalDistance
                medoid = i
        logging.debug("medoidDistance: %s" % medoidDistance)
        logging.debug("Cluster medoid: [%d] %s" % (medoid, cluster[medoid].hash))
        return cluster[medoid]

    def assign_item(self, item, origin):
        """
        Assigns an item from a given cluster to the closest located cluster

        PARAMETERS
            item   - the item to be moved
            origin - the originating cluster

        Returns True if the item was moved to another cluster, False
        otherwise.
        """
        closest_cluster = origin
        # Track the best distance found so far so the medoid of the current
        # best cluster is not recomputed on every iteration.
        closest_distance = self.distance(item,self.getMedoid(origin))
        for candidate in self._KMeansClustering__clusters:
            if not candidate:
                # An empty cluster has no medoid to compare against.
                continue
            candidate_distance = self.distance(item,self.getMedoid(candidate))
            if candidate_distance < closest_distance:
                closest_cluster = candidate
                closest_distance = candidate_distance

        if closest_cluster != origin:
            self.move_item(item, origin, closest_cluster)
            logging.debug("Item changed cluster: %s" % item.hash)
            return True
        else:
            return False

    def getMedoids(self,n):
        """
        Generate n clusters and return their medoids.
        """
        medoids = [self.getMedoid(members) for members in self.getclusters(n)]
        logging.info("Clustering completed and the following centroids were found: %s" % [c.hash for c in medoids])
        return medoids