Commit 98f794f3f6a75df5dfac85aa94962f90e9c895d8

Authored by Tássia Camões Araújo
1 parent 6c99e7cd
Exists in master and in 1 other branch: add_vagrant

Clustering initial implementation.

Showing 2 changed files with 197 additions and 0 deletions
src/clustering.py 0 → 100755
... ... @@ -0,0 +1,46 @@
  1 +#!/usr/bin/python
  2 +
  3 +# Clustering - a python script to perform clustering of popcon data.
  4 +#
  5 +# Copyright (C) 2010 Tassia Camoes <tassia@gmail.com>
  6 +#
  7 +# This program is free software: you can redistribute it and/or modify
  8 +# it under the terms of the GNU General Public License as published by
  9 +# the Free Software Foundation, either version 3 of the License, or
  10 +# (at your option) any later version.
  11 +#
  12 +# This program is distributed in the hope that it will be useful,
  13 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15 +# GNU General Public License for more details.
  16 +#
  17 +# You should have received a copy of the GNU General Public License
  18 +# along with this program. If not, see <http://www.gnu.org/licenses/>.
  19 +
  20 +import os
  21 +import sys
  22 +import logging
  23 +import datetime
  24 +from datetime import timedelta
  25 +
  26 +from config import *
  27 +from data import *
  28 +from similarity import *
  29 +from error import Error
  30 +
  31 +if __name__ == '__main__':
  32 +    try:
  33 +        cfg = Config()
  34 +        begin_time = datetime.datetime.now()
  35 +        logging.info("Clustering computation started at %s" % begin_time)
  36 +
  37 +        cl = PopconClusteredData(cfg)
  38 +
  39 +        end_time = datetime.datetime.now()
  40 +        logging.info("Clustering computation completed at %s" % end_time)
  41 +        delta = end_time - begin_time
  42 +        logging.info("Time elapsed: %d seconds." % delta.seconds)
  43 +
  44 +    except Error:
  45 +        logging.critical("Aborting process. Use '--debug' for more details.")
  46 +
... ...
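Both files import from similarity, which this commit does not touch. The distance handed to the hierarchical clusterer in data.py below is a JaccardIndex() called on two package lists; the following is only a sketch of what such a Jaccard distance might look like, an assumption since the actual similarity.py implementation is not shown here:

    class JaccardIndex:
        """Sketch of a Jaccard distance over two package lists (assumed
        behaviour of similarity.JaccardIndex, not its real code)."""
        def __call__(self, pkgs_a, pkgs_b):
            a, b = set(pkgs_a), set(pkgs_b)
            if not (a or b):
                return 0.0
            # 0.0 means identical package sets, 1.0 means disjoint sets
            return 1.0 - float(len(a & b)) / len(a | b)

Under that convention, cl.getlevel(0.5) in data.py cuts the cluster hierarchy at a Jaccard distance of 0.5.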
src/data.py
... ... @@ -29,6 +29,8 @@ import hashlib
29 29  
30 30 from error import Error
31 31 from singleton import Singleton
  32 +import cluster
  33 +from similarity import *
32 34  
33 35 class Item:
34 36     """
... ... @@ -258,3 +260,152 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton):
258 260         gc.collect()
259 261         # flush to disk database changes
260 262         self.flush()
  263 +
  264 +class PopconSubmission():
  265 +    def __init__(self,submission_hash):
  266 +        self.hash = submission_hash
  267 +        self.pkgs_list = []
  268 +
  269 +    def add_pkg(self,pkg):
  270 +        self.pkgs_list.append(pkg)
  271 +
  272 +class PopconClusteredData(Singleton):
  273 +    """
  274 +    Popcon submissions data source, defined as a singleton, which parses the submissions and clusters them.
  275 +    """
  276 +    def __init__(self,cfg):
  277 +        """
  278 +        Set initial attributes.
  279 +        """
  280 +        self.popcon_dir = os.path.expanduser(cfg.popcon_dir)
  281 +        self.clusters_dir = os.path.expanduser(cfg.clusters_dir)
  282 +        self.submissions = []
  283 +        self.clustering()
  284 +
  285 +    def parse_submission(self,submission_path,binary=1):
  286 +        """
  287 +        Parse a popcon submission, generating the names of the valid packages
  288 +        in the vote.
  289 +        """
  290 +        submission_file = open(submission_path)
  291 +        for line in submission_file:
  292 +            if not line.startswith("POPULARITY"):
  293 +                if not line.startswith("END-POPULARITY"):
  294 +                    data = line[:-1].split(" ")  # data[2]=package, data[3]=file or <NOFILES>, data[4]=tag (if any)
  295 +                    if len(data) > 3:
  296 +                        if binary:
  297 +                            # every installed package has the same weight
  298 +                            yield data[2], 1
  299 +                        elif data[3] == '<NOFILES>':
  300 +                            # No executable files to track
  301 +                            yield data[2], 1
  302 +                        elif len(data) == 4:
  303 +                            # Recently used packages
  304 +                            yield data[2], 10
  305 +                        elif data[4] == '<OLD>':
  306 +                            # Unused packages
  307 +                            yield data[2], 3
  308 +                        elif data[4] == '<RECENT-CTIME>':
  309 +                            # Recently installed packages
  310 +                            yield data[2], 8
  311 +
  312 +    def clustering(self):
  313 +        """
  314 +        Called by __init__.
  315 +        Parse the popcon submissions found at 'popcon_dir' and cluster them,
  316 +        placing the resulting clusters at 'clusters_dir'.
  317 +        """
  318 +        if not os.path.exists(self.clusters_dir):
  319 +            os.makedirs(self.clusters_dir)
  320 +
  321 +        logging.info("Clustering popcon submissions from \'%s\'" %
  322 +                     self.popcon_dir)
  323 +        logging.info("Clusters will be placed at \'%s\'" % self.clusters_dir)
  324 +
  325 +        for root, dirs, files in os.walk(self.popcon_dir):
  326 +            for submission_hash in files:
  327 +                s = PopconSubmission(submission_hash)
  328 +                submission_path = os.path.join(root, submission_hash)
  329 +                logging.debug("Parsing popcon submission \'%s\'" %
  330 +                              submission_hash)
  331 +                for pkg, freq in self.parse_submission(submission_path):
  332 +                    s.add_pkg(pkg)  # only package names are kept; freq is unused
  333 +                self.submissions.append(s)
  334 +
  335 +        distanceFunction = JaccardIndex()
  336 +        cl = cluster.HierarchicalClustering(self.submissions,lambda x,y: distanceFunction(x.pkgs_list,y.pkgs_list))
  337 +        clusters = cl.getlevel(0.5)
  338 +        for c in clusters:
  339 +            print "cluster"
  340 +            for submission in c:
  341 +                print submission.hash
  342 +        #cl = KMedoidsClusteringPopcon(self.submissions,
  343 +        #                              lambda x,y: distanceFunction(x.pkgs_list,y.pkgs_list))
  344 +        #clusters = cl.getclusters(2)
  345 +        #medoids = cl.getMedoids(2)
  346 +
  347 +class KMedoidsClusteringPopcon(cluster.KMeansClustering):
  348 +
  349 +    def __init__(self,data,distance):
  350 +        cluster.KMeansClustering.__init__(self, data, distance)
  351 +        self.distanceMatrix = {}
  352 +        for submission in self._KMeansClustering__data:  # the parent's name-mangled __data attribute
  353 +            self.distanceMatrix[submission.hash] = {}
  354 +
  355 +    def loadDistanceMatrix(self,cluster):
  356 +        for i in range(len(cluster)-1):
  357 +            for j in range(i+1,len(cluster)):
  358 +                try:
  359 +                    d = self.distanceMatrix[cluster[i].hash][cluster[j].hash]
  360 +                    logging.debug("Using d[%d,%d]" % (i,j))
  361 +                except KeyError:
  362 +                    d = self.distance(cluster[i],cluster[j])
  363 +                    self.distanceMatrix[cluster[i].hash][cluster[j].hash] = d
  364 +                    self.distanceMatrix[cluster[j].hash][cluster[i].hash] = d
  365 +                    logging.debug("d[%d,%d] = %.2f" % (i,j,d))
  366 +
  367 +    def getMedoid(self,cluster):
  368 +        """
  369 +        Return the medoid popcon submission of a given cluster, based on
  370 +        the distance function.
  371 +        """
  372 +        logging.debug("Cluster size: %d" % len(cluster))
  373 +        self.loadDistanceMatrix(cluster)
  374 +        medoidDistance = sys.maxint
  375 +        for i in range(len(cluster)):
  376 +            totalDistance = sum([self.distanceMatrix[cluster[i].hash][s.hash] for s in cluster if s.hash != cluster[i].hash])  # sum only over members of this cluster
  377 +            logging.debug("totalDistance[%d] = %f" % (i,totalDistance))
  378 +            if totalDistance < medoidDistance:
  379 +                medoidDistance = totalDistance
  380 +                medoid = i
  381 +        logging.debug("medoidDistance: %f" % medoidDistance)
  382 +        logging.debug("Cluster medoid: [%d] %s" % (medoid, cluster[medoid].hash))
  383 +        return cluster[medoid]
  384 +
  385 +    def assign_item(self, item, origin):
  386 +        """
  387 +        Assigns an item from a given cluster to the closest located cluster.
  388 +
  389 +        PARAMETERS
  390 +            item   - the item to be moved
  391 +            origin - the originating cluster
  392 +        """
  393 +        closest_cluster = origin
  394 +        for cluster in self._KMeansClustering__clusters:
  395 +            if self.distance(item,self.getMedoid(cluster)) < self.distance(item,self.getMedoid(closest_cluster)):
  396 +                closest_cluster = cluster
  397 +
  398 +        if closest_cluster != origin:
  399 +            self.move_item(item, origin, closest_cluster)
  400 +            logging.debug("Item changed cluster: %s" % item.hash)
  401 +            return True
  402 +        else:
  403 +            return False
  404 +
  405 +    def getMedoids(self,n):
  406 +        """
  407 +        Generate n clusters and return their medoids.
  408 +        """
  409 +        medoids = [self.getMedoid(cluster) for cluster in self.getclusters(n)]
  410 +        logging.info("Clustering completed and the following medoids were found: %s" % [m.hash for m in medoids])
  411 +        return medoids
... ...
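For reference, parse_submission above reads one line per package from a popcon report; the following is an assumed example (hashes, timestamps and package names are made up for illustration, only the tags <OLD>, <RECENT-CTIME> and <NOFILES> come from the code itself):

    POPULARITY-CONTEST-0 TIME:1283358000 ID:0123abcd
    1283358922 1265464234 grep /bin/fgrep
    1283358922 1265464234 gawk /usr/bin/gawk <OLD>
    1283358922 1283300000 vim /usr/bin/vim <RECENT-CTIME>
    1283358922 1265464234 dpkg-awk <NOFILES>
    END-POPULARITY-CONTEST-0 TIME:1283359000

With the default binary=1 every package line yields weight 1: ('grep', 1), ('gawk', 1), ('vim', 1), ('dpkg-awk', 1). With binary=0 the weights follow the tags: ('grep', 10) for a recently used package, ('gawk', 3) for <OLD>, ('vim', 8) for <RECENT-CTIME> and ('dpkg-awk', 1) for <NOFILES>. Since clustering() only calls add_pkg(pkg) and ignores freq, the hierarchical clustering in this commit effectively operates on binary package sets.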