Commit 353b42add083cccd2d18764ef0ec6e6b6b1878b6
1 parent: 37e376c1
Exists in master and in 1 other branch

Data classes complete refactoring.

Showing 1 changed file with 134 additions and 171 deletions
src/data.py
@@ -22,17 +22,14 @@ __license__ = """
 import os
 import sys
 import gc
-import re
 import xapian
-import axi
-from debian import debtags
 import logging
-import hashlib
 import random
+import cluster
+import shutil
 
 from error import Error
 from singleton import Singleton
-import cluster
 from dissimilarity import *
 
 def axi_search_pkgs(axi,pkgs_list):
@@ -53,101 +50,114 @@ def axi_search_pkg_tags(axi,pkg):
             term.term.startswith("XT")]
     return tags
 
+def print_index(index):
+    output = "\n---\n" + xapian.Database.__repr__(index) + "\n---\n"
+    for term in index.allterms():
+        output += term.term+"\n"
+        output += str([index.get_document(posting.docid).get_data()
+                       for posting in index.postlist(term.term)])
+        output += "\n---"
+    return output
+
 class SampleAptXapianIndex(xapian.WritableDatabase):
     """
     Sample data source for packages information, mainly useful for tests.
     """
-    def __init__(self,pkgs_list,axi):
-        xapian.WritableDatabase.__init__(self,".sample_axi",
+    def __init__(self,pkgs_list,axi,path):
+        xapian.WritableDatabase.__init__(self,path,
                                          xapian.DB_CREATE_OR_OVERWRITE)
         sample = axi_search_pkgs(axi,pkgs_list)
-        self.all_docs = []
        for package in sample:
             doc_id = self.add_document(axi.get_document(package.docid))
-            self.all_docs.append(doc_id)
 
-    def _print(self):
-        print "---"
-        print xapian.WritableDatabase.__repr__(self)
-        print "---"
-        for doc_id in self.all_docs:
-            print [term.term for term in self.get_document(doc_id).termlist()]
-        print "---"
+    def __str__(self):
+        return print_index(self)
 
 class PopconSubmission():
-    def __init__(self,submission_hash):
-        self.hash = submission_hash
-        self.pkgs_list = []
+    def __init__(self,path,user_id=0):
+        self.packages = dict()
+        self.path = path
+        self.load()
+        if user_id:
+            self.user_id = user_id
 
-    def add_pkg(self,pkg):
-        self.pkgs_list.append(pkg)
+    def __str__(self):
+        output = "\nPopularity-contest submission ID "+self.user_id
+        for pkg, weight in self.packages.items():
+            output += "\n "+pkg+": "+str(weight)
+        return output
 
-    def parse_submission(self,submission_path,binary=1):
+    def load(self,binary=1):
         """
         Parse a popcon submission, generating the names of the valid packages
         in the vote.
         """
-        submission = open(submission_path)
-        for line in submission:
-            if not line.startswith("POPULARITY"):
-                if not line.startswith("END-POPULARITY"):
-                    data = line[:-1].split(" ")
-                    if len(data) > 3:
-                        if binary:
-                            # every installed package has the same weight
-                            yield data[2], 1
-                        elif data[3] == '<NOFILES>':
+        with open(self.path) as submission:
+            for line in submission:
+                if line.startswith("POPULARITY"):
+                    self.user_id = line.split()[2].lstrip("ID:")
+                elif not line.startswith("END-POPULARITY"):
+                    data = line.rstrip('\n').split()
+                    if len(data) > 2:
+                        pkg = data[2]
+                        if len(data) > 3:
+                            exec_file = data[3]
+                        # Binary weight
+                        if binary:
+                            self.packages[pkg] = 1
+                        # Weights inherited from Enrico's anapop
                         # No executable files to track
-                            yield data[2], 1
-                        elif len(data) == 4:
+                        elif exec_file == '<NOFILES>':
+                            self.packages[pkg] = 1
                         # Recently used packages
-                            yield data[2], 10
-                        elif data[4] == '<OLD>':
+                        elif len(data) == 4:
+                            self.packages[pkg] = 10
                         # Unused packages
-                            yield data[2], 3
-                        elif data[4] == '<RECENT-CTIME>':
+                        elif data[4] == '<OLD>':
+                            self.packages[pkg] = 3
                         # Recently installed packages
-                            yield data[2], 8
-class PopconXapianIndex(xapian.WritableDatabase,Singleton):
+                        elif data[4] == '<RECENT-CTIME>':
+                            self.packages[pkg] = 8
+
+class PopconXapianIndex(xapian.WritableDatabase):
     """
     Data source for popcon submissions defined as a singleton xapian database.
     """
-    def __init__(self,cfg):
+    def __init__(self,cfg,reindex=0,recluster=0):
         """
         Set initial attributes.
         """
-        self.path = os.path.expanduser(cfg.popcon_index)
-        self.popcon_dir = os.path.expanduser(cfg.popcon_dir)
-        #self.debtags_path = os.path.expanduser(cfg.tags_db)
         self.axi = xapian.Database(cfg.axi)
-        self.load_index()
+        self.path = os.path.expanduser(cfg.popcon_index)
+        if reindex or not self.load_index():
+            if not os.path.exists(cfg.popcon_dir):
+                os.makedirs(cfg.popcon_dir)
+            if not os.listdir(cfg.popcon_dir):
+                logging.critical("Popcon dir seems to be empty.")
+                raise Error
+            if not cfg.clustering:
+                self.source_dir = os.path.expanduser(cfg.popcon_dir)
+            else:
+                self.source_dir = os.path.expanduser(cfg.clusters_dir)
+                if not os.path.exists(cfg.clusters_dir):
+                    os.makedirs(cfg.clusters_dir)
+                if not os.listdir(cfg.clusters_dir):
+                    distance = JaccardDistance()
+                    logging.info("Clustering popcon submissions from \'%s\'"
+                                 % cfg.popcon_dir)
+                    logging.info("Clusters will be placed at \'%s\'"
+                                 % cfg.clusters_dir)
+                    data = self.get_submissions(cfg.popcon_dir)
+                    if cfg.clustering == "Hierarchical":
+                        self.hierarchical_clustering(data,cfg.clusters_dir,
+                                                     distance)
+                    else:
+                        self.kmedoids_clustering(data,cfg.clusters_dir,
+                                                 distance)
+            self.build_index()
 
-    def parse_submission(self,submission_path,binary=1):
-        """
-        Parse a popcon submission, generating the names of the valid packages
-        in the vote.
-        """
-        submission = open(submission_path)
-        for line in submission:
-            if not line.startswith("POPULARITY"):
-                if not line.startswith("END-POPULARITY"):
-                    data = line[:-1].split(" ")
-                    if len(data) > 3:
-                        if binary:
-                            # every installed package has the same weight
-                            yield data[2], 1
-                        elif data[3] == '<NOFILES>':
-                            # No executable files to track
-                            yield data[2], 1
-                        elif len(data) == 4:
-                            # Recently used packages
-                            yield data[2], 10
-                        elif data[4] == '<OLD>':
-                            # Unused packages
-                            yield data[2], 3
-                        elif data[4] == '<RECENT-CTIME>':
-                            # Recently installed packages
-                            yield data[2], 8
+    def __str__(self):
+        return print_index(self)
 
     def load_index(self):
         """
@@ -159,19 +169,19 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton):
             xapian.Database.__init__(self,self.path)
         except xapian.DatabaseError:
             logging.info("Could not open popcon index.")
-            self.new_index()
+            return 0
 
-    def new_index(self):
+    def build_index(self):
         """
-        Create a xapian index for popcon submissions at 'popcon_dir' and
+        Create a xapian index for popcon submissions at 'source_dir' and
         place it at 'self.path'.
         """
-        if not os.path.exists(self.path):
-            os.makedirs(self.path)
+        shutil.rmtree(self.path,1)
+        os.makedirs(self.path)
 
         try:
             logging.info("Indexing popcon submissions from \'%s\'" %
-                         self.popcon_dir)
+                         self.source_dir)
             logging.info("Creating new xapian index at \'%s\'" %
                          self.path)
             xapian.WritableDatabase.__init__(self,self.path,
@@ -180,123 +190,79 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton):
             logging.critical("Could not create popcon xapian index.")
             raise Error
 
-        for root, dirs, files in os.walk(self.popcon_dir):
-            for submission in files:
-                submission_path = os.path.join(root, submission)
+        for root, dirs, files in os.walk(self.source_dir):
+            for popcon_file in files:
+                submission = PopconSubmission(os.path.join(root, popcon_file))
                 doc = xapian.Document()
-                doc.set_data(submission)
-                logging.debug("Parsing popcon submission at \'%s\'" %
-                              submission_path)
-                for pkg, freq in self.parse_submission(submission_path):
+                doc.set_data(submission.user_id)
+                logging.debug("Parsing popcon submission \'%s\'" %
+                              submission.user_id)
+                for pkg, freq in submission.packages.items():
                     doc.add_term("XP"+pkg,freq)
                     for tag in axi_search_pkg_tags(self.axi,pkg):
-                        print tag
                         doc.add_term(tag,freq)
                 doc_id = self.add_document(doc)
                 logging.debug("Popcon Xapian: Indexing doc %d" % doc_id)
                 # python garbage collector
                 gc.collect()
         # flush to disk database changes
-        self.flush()
+        self.commit()
 
-class PopconClusteredData(Singleton):
-    """
-    Data source for popcon submissions defined as a singleton xapian database.
-    """
-    def __init__(self,cfg):
+    def get_submissions(self,submissions_dir):
         """
-        Set initial attributes.
+        Get popcon submissions from popcon_dir
         """
-        self.popcon_dir = os.path.expanduser(cfg.popcon_dir)
-        self.clusters_dir = os.path.expanduser(cfg.clusters_dir)
-        self.submissions = []
-        self.clustering()
+        submissions = []
+        for root, dirs, files in os.walk(submissions_dir):
+            for popcon_file in files:
+                submission = PopconSubmission(os.path.join(root, popcon_file))
+                submissions.append(submission)
+        return submissions
 
-    def parse_submission(self,submission_path,binary=1):
-        """
-        Parse a popcon submission, generating the names of the valid packages
-        in the vote.
-        """
-        submission_file = open(submission_path)
-        for line in submission_file:
-            if not line.startswith("POPULARITY"):
-                if not line.startswith("END-POPULARITY"):
-                    data = line[:-1].split(" ")
-                    if len(data) > 3:
-                        if binary:
-                            # every installed package has the same weight
-                            yield data[2], 1
-                        elif data[3] == '<NOFILES>':
-                            # No executable files to track
-                            yield data[2], 1
-                        elif len(data) == 4:
-                            # Recently used packages
-                            yield data[2], 10
-                        elif data[4] == '<OLD>':
-                            # Unused packages
-                            yield data[2], 3
-                        elif data[4] == '<RECENT-CTIME>':
-                            # Recently installed packages
-                            yield data[2], 8
-
-    def clustering(self):
+    def hierarchical_clustering(self,data,clusters_dir,distance,k=10):
         """
-        called by init
-        Create a xapian index for popcon submissions at 'popcon_dir' and
-        place it at 'self.path'.
+        Select popcon submissions from popcon_dir and place them at clusters_dir
         """
-        if not os.path.exists(self.clusters_dir):
-            os.makedirs(self.clusters_dir)
-
-        logging.info("Clustering popcon submissions from \'%s\'" %
-                     self.popcon_dir)
-        logging.info("Clusters will be placed at \'%s\'" % self.clusters_dir)
+        cl = cluster.HierarchicalClustering(data, lambda x,y:
+                                            distance(x.packages.keys(),
+                                                     y.packages.keys()))
+        clusters = cl.getlevel(0.5)
+        for c in clusters:
+            print "cluster"
+            for submission in c:
+                print submission.user_id
 
-        for root, dirs, files in os.walk(self.popcon_dir):
-            for submission_hash in files:
-                s = PopconSubmission(submission_hash)
-                submission_path = os.path.join(root, submission_hash)
-                logging.debug("Parsing popcon submission \'%s\'" %
-                              submission_hash)
-                for pkg, freq in self.parse_submission(submission_path):
-                    s.add_pkg(pkg)
-                self.submissions.append(s)
-
-        distanceFunction = JaccardDistance()
-        # cl = cluster.HierarchicalClustering(self.submissions,lambda x,y: distanceFunction(x.pkgs_list,y.pkgs_list))
-        # clusters = cl.getlevel(0.5)
-        # for c in clusters:
-        #     print "cluster"
-        #     for submission in c:
-        #         print submission.hash
-        cl = KMedoidsClusteringPopcon(self.submissions, lambda x,y: \
-                                      distanceFunction(x.pkgs_list,y.pkgs_list))
-        #clusters = cl.getclusters(2)
-        medoids = cl.getMedoids(2)
-        print "medoids"
-        for m in medoids:
-            print m.hash
+    def kmedoids_clustering(self,data,clusters_dir,distance,k=10):
+        clusters = KMedoidsClustering(data,lambda x,y:
+                                      distance(x.packages.keys(),
+                                               y.packages.keys()))
+        medoids = clusters.getMedoids(2)
+        for submission in medoids:
+            shutil.copyfile(submission.path,os.path.join(clusters_dir,
+                            submission.user_id))
 
-class KMedoidsClusteringPopcon(cluster.KMeansClustering):
+class KMedoidsClustering(cluster.KMeansClustering):
 
     def __init__(self,data,distance):
-        if len(data)>100:
+        if len(data)<100:
+            data_sample = data
+        else:
             data_sample = random.sample(data,100)
         cluster.KMeansClustering.__init__(self, data_sample, distance)
         self.distanceMatrix = {}
         for submission in self._KMeansClustering__data:
-            self.distanceMatrix[submission.hash] = {}
+            self.distanceMatrix[submission.user_id] = {}
 
     def loadDistanceMatrix(self,cluster):
        for i in range(len(cluster)-1):
             for j in range(i+1,len(cluster)):
                 try:
-                    d = self.distanceMatrix[cluster[i].hash][cluster[j].hash]
+                    d = self.distanceMatrix[cluster[i].user_id][cluster[j].user_id]
                     logging.debug("Using d[%d,%d]" % (i,j))
                 except:
                     d = self.distance(cluster[i],cluster[j])
-                    self.distanceMatrix[cluster[i].hash][cluster[j].hash] = d
-                    self.distanceMatrix[cluster[j].hash][cluster[i].hash] = d
+                    self.distanceMatrix[cluster[i].user_id][cluster[j].user_id] = d
+                    self.distanceMatrix[cluster[j].user_id][cluster[i].user_id] = d
                 logging.debug("d[%d,%d] = %.2f" % (i,j,d))
 
     def getMedoid(self,cluster):
@@ -308,22 +274,19 @@ class KMedoidsClusteringPopcon(cluster.KMeansClustering):
         self.loadDistanceMatrix(cluster)
         medoidDistance = sys.maxint
         for i in range(len(cluster)):
-            totalDistance = sum(self.distanceMatrix[cluster[i].hash].values())
+            totalDistance = sum(self.distanceMatrix[cluster[i].user_id].values())
             print "totalDistance[",i,"]=",totalDistance
             if totalDistance < medoidDistance:
                 medoidDistance = totalDistance
                 medoid = i
         print "medoidDistance:",medoidDistance
-        logging.debug("Cluster medoid: [%d] %s" % (medoid, cluster[medoid].hash))
+        logging.debug("Cluster medoid: [%d] %s" % (medoid,
+                                                   cluster[medoid].user_id))
         return cluster[medoid]
 
     def assign_item(self, item, origin):
         """
         Assigns an item from a given cluster to the closest located cluster
-
-        PARAMETERS
-        item - the item to be moved
-        origin - the originating cluster
         """
         closest_cluster = origin
         for cluster in self._KMeansClustering__clusters:
@@ -332,7 +295,7 @@
 
         if closest_cluster != origin:
             self.move_item(item, origin, closest_cluster)
-            logging.debug("Item changed cluster: %s" % item.hash)
+            logging.debug("Item changed cluster: %s" % item.user_id)
             return True
         else:
             return False
@@ -342,5 +305,5 @@
         Generate n clusters and return their medoids.
         """
         medoids = [self.getMedoid(cluster) for cluster in self.getclusters(n)]
-        logging.info("Clustering completed and the following centroids were found: %s" % [c.hash for c in medoids])
+        logging.info("Clustering completed and the following centroids were found: %s" % [c.user_id for c in medoids])
         return medoids
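
For reference, a minimal usage sketch of the refactored PopconSubmission after this change. The submission path is a hypothetical example; since __init__ calls load() with the default binary=1, every listed package gets weight 1.

    # Hypothetical usage sketch (Python 2, matching the codebase);
    # the path below is an assumption for illustration only.
    from data import PopconSubmission

    submission = PopconSubmission("/path/to/popcon-submission")
    print submission.user_id    # ID parsed from the POPULARITY header line
    print submission.packages   # package -> weight dict, e.g. {'vim': 1, 'gcc': 1}
    print submission            # __str__ lists every package with its weight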