Commit 98f794f3f6a75df5dfac85aa94962f90e9c895d8
1 parent
6c99e7cd
Exists in
master
and in
1 other branch
Clustering initial implementation.
Showing
2 changed files
with
197 additions
and
0 deletions
Show diff stats
| @@ -0,0 +1,46 @@ | @@ -0,0 +1,46 @@ | ||
| 1 | +#!/usr/bin/python | ||
| 2 | + | ||
| 3 | +# Clustering - a python script to perform clustering of popcon data. | ||
| 4 | +# | ||
| 5 | +# Copyright (C) 2010 Tassia Camoes <tassia@gmail.com> | ||
| 6 | +# | ||
| 7 | +# This program is free software: you can redistribute it and/or modify | ||
| 8 | +# it under the terms of the GNU General Public License as published by | ||
| 9 | +# the Free Software Foundation, either version 3 of the License, or | ||
| 10 | +# (at your option) any later version. | ||
| 11 | +# | ||
| 12 | +# This program is distributed in the hope that it will be useful, | ||
| 13 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 14 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 15 | +# GNU General Public License for more details. | ||
| 16 | +# | ||
| 17 | +# You should have received a copy of the GNU General Public License | ||
| 18 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
| 19 | + | ||
| 20 | +import os | ||
| 21 | +import sys | ||
| 22 | +import logging | ||
| 23 | +import datetime | ||
| 24 | +from datetime import timedelta | ||
| 25 | + | ||
| 26 | +from config import * | ||
| 27 | +from data import * | ||
| 28 | +from similarity import * | ||
| 29 | +from error import Error | ||
| 30 | + | ||
if __name__ == '__main__':
    # Script entry point: run the popcon clustering once and log when it
    # started, when it finished and how long it took.
    try:
        cfg = Config()
        begin_time = datetime.datetime.now()
        logging.info("Clustering computation started at %s", begin_time)

        # The constructor of PopconClusteredData runs the whole clustering,
        # so building the object is the actual computation being timed.
        cl = PopconClusteredData(cfg)

        end_time = datetime.datetime.now()
        logging.info("Clustering computation completed at %s", end_time)
        delta = end_time - begin_time
        # timedelta.seconds only holds the remainder below one day; add the
        # whole days so runs longer than 24 hours are reported correctly.
        elapsed_seconds = delta.days * 86400 + delta.seconds
        logging.info("Time elapsed: %d seconds.", elapsed_seconds)

    except Error:
        logging.critical("Aborting proccess. Use '--debug' for more details.")
| 46 | + |
src/data.py
| @@ -29,6 +29,8 @@ import hashlib | @@ -29,6 +29,8 @@ import hashlib | ||
| 29 | 29 | ||
| 30 | from error import Error | 30 | from error import Error |
| 31 | from singleton import Singleton | 31 | from singleton import Singleton |
| 32 | +import cluster | ||
| 33 | +from similarity import * | ||
| 32 | 34 | ||
| 33 | class Item: | 35 | class Item: |
| 34 | """ | 36 | """ |
| @@ -258,3 +260,152 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton): | @@ -258,3 +260,152 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton): | ||
| 258 | gc.collect() | 260 | gc.collect() |
| 259 | # flush to disk database changes | 261 | # flush to disk database changes |
| 260 | self.flush() | 262 | self.flush() |
| 263 | + | ||
class PopconSubmission():
    """
    One popcon submission: the hash that identifies it plus the packages
    that were added to it, in insertion order.
    """
    def __init__(self,submission_hash):
        """
        Start a submission identified by 'submission_hash' with no packages.
        """
        self.pkgs_list = list()
        self.hash = submission_hash

    def add_pkg(self,pkg):
        """
        Put 'pkg' at the end of this submission's package list.
        """
        self.pkgs_list.extend((pkg,))
| 271 | + | ||
class PopconClusteredData(Singleton):
    """
    Clustering of the popcon submissions found in a directory tree.

    Creating an instance parses every submission file under the configured
    popcon directory and immediately runs the clustering on them.
    """
    def __init__(self,cfg):
        """
        Set initial attributes and run the clustering.

        cfg - configuration object providing 'popcon_dir' (directory the
              submissions are read from) and 'clusters_dir' (directory that
              is created if it does not exist yet).
        """
        self.popcon_dir = os.path.expanduser(cfg.popcon_dir)
        self.clusters_dir = os.path.expanduser(cfg.clusters_dir)
        self.submissions = []
        self.clustering()

    def parse_submission(self,submission_path,binary=1):
        """
        Parse a popcon submission, generating a (package name, weight) pair
        for each valid package in the vote.

        submission_path - path of the submission file to read
        binary - when true (the default) every package gets weight 1;
                 otherwise the weight depends on the markers found in the
                 package line.
        """
        # The context manager closes the file once the generator is
        # exhausted or closed, instead of leaking the handle.
        with open(submission_path) as submission_file:
            for line in submission_file:
                # The header and footer lines carry no package information.
                if line.startswith(("POPULARITY", "END-POPULARITY")):
                    continue
                # Strip only the line terminator, so a last line without a
                # trailing newline does not lose a real character.
                data = line.rstrip("\n").split(" ")
                if len(data) <= 3:
                    continue
                pkg = data[2]
                if binary:
                    # every installed package has the same weight
                    yield pkg, 1
                elif data[3] == '<NOFILES>':
                    # No executable files to track
                    yield pkg, 1
                elif len(data) == 4:
                    # Recently used packages
                    yield pkg, 10
                elif data[4] == '<OLD>':
                    # Unused packages
                    yield pkg, 3
                elif data[4] == '<RECENT-CTIME>':
                    # Recently installed packages
                    yield pkg, 8
                # Lines carrying any other marker are skipped.

    def clustering(self):
        """
        Called by __init__.
        Parse every popcon submission found under 'popcon_dir', group the
        submissions with hierarchical clustering and print the hash of each
        member of every resulting cluster. The directory 'clusters_dir' is
        created if it does not exist yet.
        """
        if not os.path.exists(self.clusters_dir):
            os.makedirs(self.clusters_dir)

        logging.info("Clustering popcon submissions from \'%s\'" %
                     self.popcon_dir)
        logging.info("Clusters will be placed at \'%s\'" % self.clusters_dir)

        for root, dirs, files in os.walk(self.popcon_dir):
            for submission_hash in files:
                s = PopconSubmission(submission_hash)
                submission_path = os.path.join(root, submission_hash)
                logging.debug("Parsing popcon submission \'%s\'" %
                              submission_hash)
                # Only package names are kept; the weights are not used by
                # the clustering below.
                for pkg, _weight in self.parse_submission(submission_path):
                    s.add_pkg(pkg)
                self.submissions.append(s)

        # NOTE(review): JaccardIndex is handed to the clustering as the
        # distance function; confirm it returns a dissimilarity rather than
        # a similarity.
        distanceFunction = JaccardIndex()
        cl = cluster.HierarchicalClustering(
            self.submissions,
            lambda x,y: distanceFunction(x.pkgs_list,y.pkgs_list))
        clusters = cl.getlevel(0.5)
        for c in clusters:
            # Single-argument call form prints the same text under both
            # Python 2 and Python 3.
            print("cluster")
            for submission in c:
                print(submission.hash)
        # TODO: alternative still to be wired in, using k-medoids:
        #cl = KMedoidsClusteringPopcon(self.submissions,
        #        lambda x,y: distanceFunction(x.pkgs_list,y.pkgs_list))
        #clusters = cl.getclusters(2)
        #medoids = cl.getMedoids(2)
| 346 | + | ||
class KMedoidsClusteringPopcon(cluster.KMeansClustering):
    """
    Clustering of popcon submissions in which every cluster is represented
    by its medoid, i.e. the member with the smallest total distance to the
    other members of the same cluster.

    Pairwise distances are cached in 'distanceMatrix', keyed by submission
    hash, so each pair is evaluated by the distance function only once.
    """

    def __init__(self,data,distance):
        """
        Initialize the base clustering and an empty distance cache.

        data - the popcon submissions to cluster; each must expose 'hash'
        distance - callable taking two submissions and returning their
                   distance
        """
        cluster.KMeansClustering.__init__(self, data, distance)
        # distanceMatrix[a][b] holds the distance between the submissions
        # whose hashes are a and b.
        self.distanceMatrix = {}
        # The data is stored by the base class under a name-mangled private
        # attribute, hence the explicit _KMeansClustering prefix.
        for submission in self._KMeansClustering__data:
            self.distanceMatrix[submission.hash] = {}

    def loadDistanceMatrix(self,cluster):
        """
        Make sure the distance cache holds the distance between every pair
        of submissions of the given cluster, computing the missing ones.
        """
        for i in range(len(cluster)-1):
            for j in range(i+1,len(cluster)):
                first = cluster[i].hash
                second = cluster[j].hash
                try:
                    d = self.distanceMatrix[first][second]
                    logging.debug("Using d[%d,%d]" % (i,j))
                except KeyError:
                    # Only a cache miss is handled here; errors raised by
                    # the distance function itself must propagate.
                    d = self.distance(cluster[i],cluster[j])
                    self.distanceMatrix.setdefault(first, {})[second] = d
                    self.distanceMatrix.setdefault(second, {})[first] = d
                    logging.debug("d[%d,%d] = %.2f" % (i,j,d))

    def getMedoid(self,cluster):
        """
        Return the medoid popcon submission of a given cluster, based on
        the distance function.

        Raises ValueError when the cluster is empty, since an empty cluster
        has no medoid.
        """
        if not cluster:
            raise ValueError("Cannot compute the medoid of an empty cluster.")
        logging.debug("Cluster size: %d" % len(cluster))
        self.loadDistanceMatrix(cluster)
        medoid = 0
        medoidDistance = float("inf")
        for i in range(len(cluster)):
            row = self.distanceMatrix.get(cluster[i].hash, {})
            # Sum only the distances to the other members of this cluster:
            # the cached row may also hold distances to submissions that
            # belong to other clusters.
            totalDistance = sum(row[cluster[j].hash]
                                for j in range(len(cluster)) if j != i)
            logging.debug("totalDistance[%d] = %s" % (i, totalDistance))
            if totalDistance < medoidDistance:
                medoidDistance = totalDistance
                medoid = i
        logging.debug("medoidDistance: %s" % medoidDistance)
        logging.debug("Cluster medoid: [%d] %s" % (medoid, cluster[medoid].hash))
        return cluster[medoid]

    def assign_item(self, item, origin):
        """
        Assigns an item from a given cluster to the closest located cluster

        PARAMETERS
            item - the item to be moved
            origin - the originating cluster

        Returns True when the item was moved to another cluster and False
        when it stayed in its originating cluster.
        """
        closest_cluster = origin
        # Distance to the current best medoid is computed once and updated
        # only when a closer cluster is found.
        closest_distance = self.distance(item, self.getMedoid(origin))
        for candidate in self._KMeansClustering__clusters:
            if not candidate:
                # An empty cluster has no medoid to compare against.
                continue
            candidate_distance = self.distance(item, self.getMedoid(candidate))
            if candidate_distance < closest_distance:
                closest_cluster = candidate
                closest_distance = candidate_distance

        if closest_cluster != origin:
            self.move_item(item, origin, closest_cluster)
            logging.debug("Item changed cluster: %s" % item.hash)
            return True
        else:
            return False

    def getMedoids(self,n):
        """
        Generate n clusters and return their medoids.
        """
        medoids = [self.getMedoid(members) for members in self.getclusters(n)]
        logging.info("Clustering completed and the following centroids were found: %s" % [c.hash for c in medoids])
        return medoids