Commit 353b42add083cccd2d18764ef0ec6e6b6b1878b6
1 parent
37e376c1
Exists in
master
and in
1 other branch
Data classes complete refactoring.
Showing
1 changed file
with
134 additions
and
171 deletions
Show diff stats
src/data.py
| ... | ... | @@ -22,17 +22,14 @@ __license__ = """ |
| 22 | 22 | import os |
| 23 | 23 | import sys |
| 24 | 24 | import gc |
| 25 | -import re | |
| 26 | 25 | import xapian |
| 27 | -import axi | |
| 28 | -from debian import debtags | |
| 29 | 26 | import logging |
| 30 | -import hashlib | |
| 31 | 27 | import random |
| 28 | +import cluster | |
| 29 | +import shutil | |
| 32 | 30 | |
| 33 | 31 | from error import Error |
| 34 | 32 | from singleton import Singleton |
| 35 | -import cluster | |
| 36 | 33 | from dissimilarity import * |
| 37 | 34 | |
| 38 | 35 | def axi_search_pkgs(axi,pkgs_list): |
| ... | ... | @@ -53,101 +50,114 @@ def axi_search_pkg_tags(axi,pkg): |
| 53 | 50 | term.term.startswith("XT")] |
| 54 | 51 | return tags |
| 55 | 52 | |
| 53 | +def print_index(index): | |
| 54 | + output = "\n---\n" + xapian.Database.__repr__(index) + "\n---\n" | |
| 55 | + for term in index.allterms(): | |
| 56 | + output += term.term+"\n" | |
| 57 | + output += str([index.get_document(posting.docid).get_data() | |
| 58 | + for posting in index.postlist(term.term)]) | |
| 59 | + output += "\n---" | |
| 60 | + return output | |
| 61 | + | |
| 56 | 62 | class SampleAptXapianIndex(xapian.WritableDatabase): |
| 57 | 63 | """ |
| 58 | 64 | Sample data source for packages information, mainly useful for tests. |
| 59 | 65 | """ |
| 60 | - def __init__(self,pkgs_list,axi): | |
| 61 | - xapian.WritableDatabase.__init__(self,".sample_axi", | |
| 66 | + def __init__(self,pkgs_list,axi,path): | |
| 67 | + xapian.WritableDatabase.__init__(self,path, | |
| 62 | 68 | xapian.DB_CREATE_OR_OVERWRITE) |
| 63 | 69 | sample = axi_search_pkgs(axi,pkgs_list) |
| 64 | - self.all_docs = [] | |
| 65 | 70 | for package in sample: |
| 66 | 71 | doc_id = self.add_document(axi.get_document(package.docid)) |
| 67 | - self.all_docs.append(doc_id) | |
| 68 | 72 | |
| 69 | - def _print(self): | |
| 70 | - print "---" | |
| 71 | - print xapian.WritableDatabase.__repr__(self) | |
| 72 | - print "---" | |
| 73 | - for doc_id in self.all_docs: | |
| 74 | - print [term.term for term in self.get_document(doc_id).termlist()] | |
| 75 | - print "---" | |
| 73 | + def __str__(self): | |
| 74 | + return print_index(self) | |
| 76 | 75 | |
| 77 | 76 | class PopconSubmission(): |
| 78 | - def __init__(self,submission_hash): | |
| 79 | - self.hash = submission_hash | |
| 80 | - self.pkgs_list = [] | |
| 77 | + def __init__(self,path,user_id=0): | |
| 78 | + self.packages = dict() | |
| 79 | + self.path = path | |
| 80 | + self.load() | |
| 81 | + if user_id: | |
| 82 | + self.user_id = user_id | |
| 81 | 83 | |
| 82 | - def add_pkg(self,pkg): | |
| 83 | - self.pkgs_list.append(pkg) | |
| 84 | + def __str__(self): | |
| 85 | + output = "\nPopularity-contest submission ID "+self.user_id | |
| 86 | + for pkg, weight in self.packages.items(): | |
| 87 | + output += "\n "+pkg+": "+str(weight) | |
| 88 | + return output | |
| 84 | 89 | |
| 85 | - def parse_submission(self,submission_path,binary=1): | |
| 90 | + def load(self,binary=1): | |
| 86 | 91 | """ |
| 87 | 92 | Parse a popcon submission, storing the valid packages in the vote |
| 88 | 93 | and their weights in self.packages. |
| 89 | 94 | """ |
| 90 | - submission = open(submission_path) | |
| 91 | - for line in submission: | |
| 92 | - if not line.startswith("POPULARITY"): | |
| 93 | - if not line.startswith("END-POPULARITY"): | |
| 94 | - data = line[:-1].split(" ") | |
| 95 | - if len(data) > 3: | |
| 96 | - if binary: | |
| 97 | - # every installed package has the same weight | |
| 98 | - yield data[2], 1 | |
| 99 | - elif data[3] == '<NOFILES>': | |
| 95 | + with open(self.path) as submission: | |
| 96 | + for line in submission: | |
| 97 | + if line.startswith("POPULARITY"): | |
| 98 | + self.user_id = line.split()[2].lstrip("ID:") | |
| 99 | + elif not line.startswith("END-POPULARITY"): | |
| 100 | + data = line.rstrip('\n').split() | |
| 101 | + if len(data) > 2: | |
| 102 | + pkg = data[2] | |
| 103 | + if len(data) > 3: | |
| 104 | + exec_file = data[3] | |
| 105 | + # Binary weight | |
| 106 | + if binary: | |
| 107 | + self.packages[pkg] = 1 | |
| 108 | + # Weights inherited from Enrico's anapop | |
| 100 | 109 | # No executable files to track |
| 101 | - yield data[2], 1 | |
| 102 | - elif len(data) == 4: | |
| 110 | + elif exec_file == '<NOFILES>': | |
| 111 | + self.packages[pkg] = 1 | |
| 103 | 112 | # Recently used packages |
| 104 | - yield data[2], 10 | |
| 105 | - elif data[4] == '<OLD>': | |
| 113 | + elif len(data) == 4: | |
| 114 | + self.packages[pkg] = 10 | |
| 106 | 115 | # Unused packages |
| 107 | - yield data[2], 3 | |
| 108 | - elif data[4] == '<RECENT-CTIME>': | |
| 116 | + elif data[4] == '<OLD>': | |
| 117 | + self.packages[pkg] = 3 | |
| 109 | 118 | # Recently installed packages |
| 110 | - yield data[2], 8 | |
| 111 | -class PopconXapianIndex(xapian.WritableDatabase,Singleton): | |
| 119 | + elif data[4] == '<RECENT-CTIME>': | |
| 120 | + self.packages[pkg] = 8 | |
| 121 | + | |
| 122 | +class PopconXapianIndex(xapian.WritableDatabase): | |
| 112 | 123 | """ |
| 113 | 124 | Data source for popcon submissions defined as a xapian database. |
| 114 | 125 | """ |
| 115 | - def __init__(self,cfg): | |
| 126 | + def __init__(self,cfg,reindex=0,recluster=0): | |
| 116 | 127 | """ |
| 117 | 128 | Set initial attributes. |
| 118 | 129 | """ |
| 119 | - self.path = os.path.expanduser(cfg.popcon_index) | |
| 120 | - self.popcon_dir = os.path.expanduser(cfg.popcon_dir) | |
| 121 | - #self.debtags_path = os.path.expanduser(cfg.tags_db) | |
| 122 | 130 | self.axi = xapian.Database(cfg.axi) |
| 123 | - self.load_index() | |
| 131 | + self.path = os.path.expanduser(cfg.popcon_index) | |
| 132 | + if reindex or not self.load_index(): | |
| 133 | + if not os.path.exists(cfg.popcon_dir): | |
| 134 | + os.makedirs(cfg.popcon_dir) | |
| 135 | + if not os.listdir(cfg.popcon_dir): | |
| 136 | + logging.critical("Popcon dir seems to be empty.") | |
| 137 | + raise Error | |
| 138 | + if not cfg.clustering: | |
| 139 | + self.source_dir = os.path.expanduser(cfg.popcon_dir) | |
| 140 | + else: | |
| 141 | + self.source_dir = os.path.expanduser(cfg.clusters_dir) | |
| 142 | + if not os.path.exists(cfg.clusters_dir): | |
| 143 | + os.makedirs(cfg.clusters_dir) | |
| 144 | + if not os.listdir(cfg.clusters_dir): | |
| 145 | + distance = JaccardDistance() | |
| 146 | + logging.info("Clustering popcon submissions from \'%s\'" | |
| 147 | + % cfg.popcon_dir) | |
| 148 | + logging.info("Clusters will be placed at \'%s\'" | |
| 149 | + % cfg.clusters_dir) | |
| 150 | + data = self.get_submissions(cfg.popcon_dir) | |
| 151 | + if cfg.clustering == "Hierarchical": | |
| 152 | + self.hierarchical_clustering(data,cfg.clusters_dir, | |
| 153 | + distance) | |
| 154 | + else: | |
| 155 | + self.kmedoids_clustering(data,cfg.clusters_dir, | |
| 156 | + distance) | |
| 157 | + self.build_index() | |
| 124 | 158 | |
| 125 | - def parse_submission(self,submission_path,binary=1): | |
| 126 | - """ | |
| 127 | - Parse a popcon submission, generating the names of the valid packages | |
| 128 | - in the vote. | |
| 129 | - """ | |
| 130 | - submission = open(submission_path) | |
| 131 | - for line in submission: | |
| 132 | - if not line.startswith("POPULARITY"): | |
| 133 | - if not line.startswith("END-POPULARITY"): | |
| 134 | - data = line[:-1].split(" ") | |
| 135 | - if len(data) > 3: | |
| 136 | - if binary: | |
| 137 | - # every installed package has the same weight | |
| 138 | - yield data[2], 1 | |
| 139 | - elif data[3] == '<NOFILES>': | |
| 140 | - # No executable files to track | |
| 141 | - yield data[2], 1 | |
| 142 | - elif len(data) == 4: | |
| 143 | - # Recently used packages | |
| 144 | - yield data[2], 10 | |
| 145 | - elif data[4] == '<OLD>': | |
| 146 | - # Unused packages | |
| 147 | - yield data[2], 3 | |
| 148 | - elif data[4] == '<RECENT-CTIME>': | |
| 149 | - # Recently installed packages | |
| 150 | - yield data[2], 8 | |
| 159 | + def __str__(self): | |
| 160 | + return print_index(self) | |
| 151 | 161 | |
| 152 | 162 | def load_index(self): |
| 153 | 163 | """ |
| ... | ... | @@ -159,19 +169,19 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton): |
| 159 | 169 | xapian.Database.__init__(self,self.path) |
| 160 | 170 | except xapian.DatabaseError: |
| 161 | 171 | logging.info("Could not open popcon index.") |
| 162 | - self.new_index() | |
| 172 | + return 0 | |
| 163 | 173 | |
| 164 | - def new_index(self): | |
| 174 | + def build_index(self): | |
| 165 | 175 | """ |
| 166 | - Create a xapian index for popcon submissions at 'popcon_dir' and | |
| 176 | + Create a xapian index for popcon submissions at 'source_dir' and | |
| 167 | 177 | place it at 'self.path'. |
| 168 | 178 | """ |
| 169 | - if not os.path.exists(self.path): | |
| 170 | - os.makedirs(self.path) | |
| 179 | + shutil.rmtree(self.path,1) | |
| 180 | + os.makedirs(self.path) | |
| 171 | 181 | |
| 172 | 182 | try: |
| 173 | 183 | logging.info("Indexing popcon submissions from \'%s\'" % |
| 174 | - self.popcon_dir) | |
| 184 | + self.source_dir) | |
| 175 | 185 | logging.info("Creating new xapian index at \'%s\'" % |
| 176 | 186 | self.path) |
| 177 | 187 | xapian.WritableDatabase.__init__(self,self.path, |
| ... | ... | @@ -180,123 +190,79 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton): |
| 180 | 190 | logging.critical("Could not create popcon xapian index.") |
| 181 | 191 | raise Error |
| 182 | 192 | |
| 183 | - for root, dirs, files in os.walk(self.popcon_dir): | |
| 184 | - for submission in files: | |
| 185 | - submission_path = os.path.join(root, submission) | |
| 193 | + for root, dirs, files in os.walk(self.source_dir): | |
| 194 | + for popcon_file in files: | |
| 195 | + submission = PopconSubmission(os.path.join(root, popcon_file)) | |
| 186 | 196 | doc = xapian.Document() |
| 187 | - doc.set_data(submission) | |
| 188 | - logging.debug("Parsing popcon submission at \'%s\'" % | |
| 189 | - submission_path) | |
| 190 | - for pkg, freq in self.parse_submission(submission_path): | |
| 197 | + doc.set_data(submission.user_id) | |
| 198 | + logging.debug("Parsing popcon submission \'%s\'" % | |
| 199 | + submission.user_id) | |
| 200 | + for pkg, freq in submission.packages.items(): | |
| 191 | 201 | doc.add_term("XP"+pkg,freq) |
| 192 | 202 | for tag in axi_search_pkg_tags(self.axi,pkg): |
| 193 | - print tag | |
| 194 | 203 | doc.add_term(tag,freq) |
| 195 | 204 | doc_id = self.add_document(doc) |
| 196 | 205 | logging.debug("Popcon Xapian: Indexing doc %d" % doc_id) |
| 197 | 206 | # python garbage collector |
| 198 | 207 | gc.collect() |
| 199 | 208 | # flush database changes to disk |
| 200 | - self.flush() | |
| 209 | + self.commit() | |
| 201 | 210 | |
| 202 | -class PopconClusteredData(Singleton): | |
| 203 | - """ | |
| 204 | - Data source for popcon submissions defined as a singleton xapian database. | |
| 205 | - """ | |
| 206 | - def __init__(self,cfg): | |
| 211 | + def get_submissions(self,submissions_dir): | |
| 207 | 212 | """ |
| 208 | - Set initial attributes. | |
| 213 | + Return a list with the popcon submissions found under submissions_dir | |
| 209 | 214 | """ |
| 210 | - self.popcon_dir = os.path.expanduser(cfg.popcon_dir) | |
| 211 | - self.clusters_dir = os.path.expanduser(cfg.clusters_dir) | |
| 212 | - self.submissions = [] | |
| 213 | - self.clustering() | |
| 215 | + submissions = [] | |
| 216 | + for root, dirs, files in os.walk(submissions_dir): | |
| 217 | + for popcon_file in files: | |
| 218 | + submission = PopconSubmission(os.path.join(root, popcon_file)) | |
| 219 | + submissions.append(submission) | |
| 220 | + return submissions | |
| 214 | 221 | |
| 215 | - def parse_submission(self,submission_path,binary=1): | |
| 216 | - """ | |
| 217 | - Parse a popcon submission, generating the names of the valid packages | |
| 218 | - in the vote. | |
| 219 | - """ | |
| 220 | - submission_file = open(submission_path) | |
| 221 | - for line in submission_file: | |
| 222 | - if not line.startswith("POPULARITY"): | |
| 223 | - if not line.startswith("END-POPULARITY"): | |
| 224 | - data = line[:-1].split(" ") | |
| 225 | - if len(data) > 3: | |
| 226 | - if binary: | |
| 227 | - # every installed package has the same weight | |
| 228 | - yield data[2], 1 | |
| 229 | - elif data[3] == '<NOFILES>': | |
| 230 | - # No executable files to track | |
| 231 | - yield data[2], 1 | |
| 232 | - elif len(data) == 4: | |
| 233 | - # Recently used packages | |
| 234 | - yield data[2], 10 | |
| 235 | - elif data[4] == '<OLD>': | |
| 236 | - # Unused packages | |
| 237 | - yield data[2], 3 | |
| 238 | - elif data[4] == '<RECENT-CTIME>': | |
| 239 | - # Recently installed packages | |
| 240 | - yield data[2], 8 | |
| 241 | - | |
| 242 | - def clustering(self): | |
| 222 | + def hierarchical_clustering(self,data,clusters_dir,distance,k=10): | |
| 243 | 223 | """ |
| 244 | - called by init | |
| 245 | - Create a xapian index for popcon submissions at 'popcon_dir' and | |
| 246 | - place it at 'self.path'. | |
| 224 | + Cluster the given popcon submissions hierarchically and print the members of each cluster (nothing is written to clusters_dir yet) | |
| 247 | 225 | """ |
| 248 | - if not os.path.exists(self.clusters_dir): | |
| 249 | - os.makedirs(self.clusters_dir) | |
| 250 | - | |
| 251 | - logging.info("Clustering popcon submissions from \'%s\'" % | |
| 252 | - self.popcon_dir) | |
| 253 | - logging.info("Clusters will be placed at \'%s\'" % self.clusters_dir) | |
| 226 | + cl = cluster.HierarchicalClustering(data, lambda x,y: | |
| 227 | + distance(x.packages.keys(), | |
| 228 | + y.packages.keys())) | |
| 229 | + clusters = cl.getlevel(0.5) | |
| 230 | + for c in clusters: | |
| 231 | + print "cluster" | |
| 232 | + for submission in c: | |
| 233 | + print submission.user_id | |
| 254 | 234 | |
| 255 | - for root, dirs, files in os.walk(self.popcon_dir): | |
| 256 | - for submission_hash in files: | |
| 257 | - s = PopconSubmission(submission_hash) | |
| 258 | - submission_path = os.path.join(root, submission_hash) | |
| 259 | - logging.debug("Parsing popcon submission \'%s\'" % | |
| 260 | - submission_hash) | |
| 261 | - for pkg, freq in self.parse_submission(submission_path): | |
| 262 | - s.add_pkg(pkg) | |
| 263 | - self.submissions.append(s) | |
| 264 | - | |
| 265 | - distanceFunction = JaccardDistance() | |
| 266 | - # cl = cluster.HierarchicalClustering(self.submissions,lambda x,y: distanceFunction(x.pkgs_list,y.pkgs_list)) | |
| 267 | - # clusters = cl.getlevel(0.5) | |
| 268 | - # for c in clusters: | |
| 269 | - # print "cluster" | |
| 270 | - # for submission in c: | |
| 271 | - # print submission.hash | |
| 272 | - cl = KMedoidsClusteringPopcon(self.submissions, lambda x,y: \ | |
| 273 | - distanceFunction(x.pkgs_list,y.pkgs_list)) | |
| 274 | - #clusters = cl.getclusters(2) | |
| 275 | - medoids = cl.getMedoids(2) | |
| 276 | - print "medoids" | |
| 277 | - for m in medoids: | |
| 278 | - print m.hash | |
| 235 | + def kmedoids_clustering(self,data,clusters_dir,distance,k=10): | |
| 236 | + clusters = KMedoidsClustering(data,lambda x,y: | |
| 237 | + distance(x.packages.keys(), | |
| 238 | + y.packages.keys())) | |
| 239 | + medoids = clusters.getMedoids(2) | |
| 240 | + for submission in medoids: | |
| 241 | + shutil.copyfile(submission.path,os.path.join(clusters_dir, | |
| 242 | + submission.user_id)) | |
| 279 | 243 | |
| 280 | -class KMedoidsClusteringPopcon(cluster.KMeansClustering): | |
| 244 | +class KMedoidsClustering(cluster.KMeansClustering): | |
| 281 | 245 | |
| 282 | 246 | def __init__(self,data,distance): |
| 283 | - if len(data)>100: | |
| 247 | + if len(data)<100: | |
| 248 | + data_sample = data | |
| 249 | + else: | |
| 284 | 250 | data_sample = random.sample(data,100) |
| 285 | 251 | cluster.KMeansClustering.__init__(self, data_sample, distance) |
| 286 | 252 | self.distanceMatrix = {} |
| 287 | 253 | for submission in self._KMeansClustering__data: |
| 288 | - self.distanceMatrix[submission.hash] = {} | |
| 254 | + self.distanceMatrix[submission.user_id] = {} | |
| 289 | 255 | |
| 290 | 256 | def loadDistanceMatrix(self,cluster): |
| 291 | 257 | for i in range(len(cluster)-1): |
| 292 | 258 | for j in range(i+1,len(cluster)): |
| 293 | 259 | try: |
| 294 | - d = self.distanceMatrix[cluster[i].hash][cluster[j].hash] | |
| 260 | + d = self.distanceMatrix[cluster[i].user_id][cluster[j].user_id] | |
| 295 | 261 | logging.debug("Using d[%d,%d]" % (i,j)) |
| 296 | 262 | except: |
| 297 | 263 | d = self.distance(cluster[i],cluster[j]) |
| 298 | - self.distanceMatrix[cluster[i].hash][cluster[j].hash] = d | |
| 299 | - self.distanceMatrix[cluster[j].hash][cluster[i].hash] = d | |
| 264 | + self.distanceMatrix[cluster[i].user_id][cluster[j].user_id] = d | |
| 265 | + self.distanceMatrix[cluster[j].user_id][cluster[i].user_id] = d | |
| 300 | 266 | logging.debug("d[%d,%d] = %.2f" % (i,j,d)) |
| 301 | 267 | |
| 302 | 268 | def getMedoid(self,cluster): |
| ... | ... | @@ -308,22 +274,19 @@ class KMedoidsClusteringPopcon(cluster.KMeansClustering): |
| 308 | 274 | self.loadDistanceMatrix(cluster) |
| 309 | 275 | medoidDistance = sys.maxint |
| 310 | 276 | for i in range(len(cluster)): |
| 311 | - totalDistance = sum(self.distanceMatrix[cluster[i].hash].values()) | |
| 277 | + totalDistance = sum(self.distanceMatrix[cluster[i].user_id].values()) | |
| 312 | 278 | print "totalDistance[",i,"]=",totalDistance |
| 313 | 279 | if totalDistance < medoidDistance: |
| 314 | 280 | medoidDistance = totalDistance |
| 315 | 281 | medoid = i |
| 316 | 282 | print "medoidDistance:",medoidDistance |
| 317 | - logging.debug("Cluster medoid: [%d] %s" % (medoid, cluster[medoid].hash)) | |
| 283 | + logging.debug("Cluster medoid: [%d] %s" % (medoid, | |
| 284 | + cluster[medoid].user_id)) | |
| 318 | 285 | return cluster[medoid] |
| 319 | 286 | |
| 320 | 287 | def assign_item(self, item, origin): |
| 321 | 288 | """ |
| 322 | 289 | Assigns an item from a given cluster to the closest located cluster |
| 323 | - | |
| 324 | - PARAMETERS | |
| 325 | - item - the item to be moved | |
| 326 | - origin - the originating cluster | |
| 327 | 290 | """ |
| 328 | 291 | closest_cluster = origin |
| 329 | 292 | for cluster in self._KMeansClustering__clusters: |
| ... | ... | @@ -332,7 +295,7 @@ class KMedoidsClusteringPopcon(cluster.KMeansClustering): |
| 332 | 295 | |
| 333 | 296 | if closest_cluster != origin: |
| 334 | 297 | self.move_item(item, origin, closest_cluster) |
| 335 | - logging.debug("Item changed cluster: %s" % item.hash) | |
| 298 | + logging.debug("Item changed cluster: %s" % item.user_id) | |
| 336 | 299 | return True |
| 337 | 300 | else: |
| 338 | 301 | return False |
| ... | ... | @@ -342,5 +305,5 @@ class KMedoidsClusteringPopcon(cluster.KMeansClustering): |
| 342 | 305 | Generate n clusters and return their medoids. |
| 343 | 306 | """ |
| 344 | 307 | medoids = [self.getMedoid(cluster) for cluster in self.getclusters(n)] |
| 345 | - logging.info("Clustering completed and the following centroids were found: %s" % [c.hash for c in medoids]) | |
| 308 | + logging.info("Clustering completed and the following centroids were found: %s" % [c.user_id for c in medoids]) | |
| 346 | 309 | return medoids | ... | ... |