Commit 9e2461efb830d0a400f46053db69b405b4b14aa4
1 parent
a69758a8
Exists in
master
and in
1 other branch
Disregarding TagsXapianIndex for now due to performance issues; Demographic pr…
…ofiles implementation (no strategies yet); Minor bug fixes.
Showing
7 changed files
with
222 additions
and
179 deletions
Show diff stats
src/app_recommender.py
| @@ -28,7 +28,7 @@ from datetime import timedelta | @@ -28,7 +28,7 @@ from datetime import timedelta | ||
| 28 | from config import * | 28 | from config import * |
| 29 | from data import * | 29 | from data import * |
| 30 | from evaluation import * | 30 | from evaluation import * |
| 31 | -from similarity import * | 31 | +from dissimilarity import * |
| 32 | from recommender import * | 32 | from recommender import * |
| 33 | from strategy import * | 33 | from strategy import * |
| 34 | from user import * | 34 | from user import * |
src/clustering.py
| @@ -26,7 +26,7 @@ from datetime import timedelta | @@ -26,7 +26,7 @@ from datetime import timedelta | ||
| 26 | 26 | ||
| 27 | from config import * | 27 | from config import * |
| 28 | from data import * | 28 | from data import * |
| 29 | -from similarity import * | 29 | +from dissimilarity import * |
| 30 | from error import Error | 30 | from error import Error |
| 31 | 31 | ||
| 32 | if __name__ == '__main__': | 32 | if __name__ == '__main__': |
src/cross_validation.py
| @@ -29,7 +29,7 @@ from datetime import timedelta | @@ -29,7 +29,7 @@ from datetime import timedelta | ||
| 29 | from config import * | 29 | from config import * |
| 30 | from data import * | 30 | from data import * |
| 31 | from evaluation import * | 31 | from evaluation import * |
| 32 | -from similarity import * | 32 | +from dissimilarity import * |
| 33 | from recommender import * | 33 | from recommender import * |
| 34 | from strategy import * | 34 | from strategy import * |
| 35 | from user import * | 35 | from user import * |
src/data.py
| @@ -28,36 +28,38 @@ import axi | @@ -28,36 +28,38 @@ import axi | ||
| 28 | from debian import debtags | 28 | from debian import debtags |
| 29 | import logging | 29 | import logging |
| 30 | import hashlib | 30 | import hashlib |
| 31 | +import random | ||
| 31 | 32 | ||
| 32 | from error import Error | 33 | from error import Error |
| 33 | from singleton import Singleton | 34 | from singleton import Singleton |
| 34 | import cluster | 35 | import cluster |
| 35 | -from similarity import * | ||
| 36 | - | ||
| 37 | -class Item: | ||
| 38 | - """ | ||
| 39 | - Generic item definition. | ||
| 40 | - """ | ||
| 41 | - | ||
| 42 | -class Package(Item): | ||
| 43 | - """ | ||
| 44 | - Definition of a GNU/Linux application as a recommender item. | ||
| 45 | - """ | ||
| 46 | - def __init__(self,package_name): | ||
| 47 | - """ | ||
| 48 | - Set initial attributes. | ||
| 49 | - """ | ||
| 50 | - self.package_name = package_name | ||
| 51 | - | ||
| 52 | -def normalize_tags(string): | ||
| 53 | - """ | ||
| 54 | - Substitute string characters : by _ and - by '. | ||
| 55 | - Examples: | ||
| 56 | - admin::package-management -> admin__package'management | ||
| 57 | - implemented-in::c++ -> implemented-in__c++ | ||
| 58 | - """ | ||
| 59 | - return string.replace(':','_').replace('-','\'') | ||
| 60 | - | 36 | +from dissimilarity import * |
| 37 | + | ||
| 38 | +#class Item: | ||
| 39 | +# """ | ||
| 40 | +# Generic item definition. | ||
| 41 | +# """ | ||
| 42 | +# | ||
| 43 | +#class Package(Item): | ||
| 44 | +# """ | ||
| 45 | +# Definition of a GNU/Linux application as a recommender item. | ||
| 46 | +# """ | ||
| 47 | +# def __init__(self,package_name): | ||
| 48 | +# """ | ||
| 49 | +# Set initial attributes. | ||
| 50 | +# """ | ||
| 51 | +# self.package_name = package_name | ||
| 52 | +# | ||
| 53 | +#def normalize_tags(string): | ||
| 54 | +# """ | ||
| 55 | +# Substitute string characters : by _ and - by '. | ||
| 56 | +# Examples: | ||
| 57 | +# admin::package-management -> admin__package'management | ||
| 58 | +# implemented-in::c++ -> implemented-in__c++ | ||
| 59 | +# """ | ||
| 60 | +# return string.replace(':','_').replace('-','\'') | ||
| 61 | + | ||
| 62 | +#[FIXME] get pkg tags from axi and remove load_debtags_db method | ||
| 61 | def load_debtags_db(db_path): | 63 | def load_debtags_db(db_path): |
| 62 | """ | 64 | """ |
| 63 | Load debtags database from the source file. | 65 | Load debtags database from the source file. |
| @@ -73,105 +75,105 @@ def load_debtags_db(db_path): | @@ -73,105 +75,105 @@ def load_debtags_db(db_path): | ||
| 73 | logging.error("Could not load DebtagsDB from '%s'." % self.db_path) | 75 | logging.error("Could not load DebtagsDB from '%s'." % self.db_path) |
| 74 | raise Error | 76 | raise Error |
| 75 | 77 | ||
| 76 | -class TagsXapianIndex(xapian.WritableDatabase,Singleton): | ||
| 77 | - """ | ||
| 78 | - Data source for tags info defined as a singleton xapian database. | ||
| 79 | - """ | ||
| 80 | - def __init__(self,cfg): | ||
| 81 | - """ | ||
| 82 | - Set initial attributes. | ||
| 83 | - """ | ||
| 84 | - self.path = os.path.expanduser(cfg.tags_index) | ||
| 85 | - self.db_path = os.path.expanduser(cfg.tags_db) | ||
| 86 | - self.debtags_db = debtags.DB() | ||
| 87 | - | ||
| 88 | - try: | ||
| 89 | - db_file = open(self.db_path) | ||
| 90 | - except IOError: | ||
| 91 | - logging.error("Could not load DebtagsDB from '%s'." % self.db_path) | ||
| 92 | - raise Error | ||
| 93 | - md5 = hashlib.md5() | ||
| 94 | - md5.update(db_file.read()) | ||
| 95 | - self.db_md5 = md5.hexdigest() | ||
| 96 | - db_file.close() | ||
| 97 | - self.load_index(cfg.reindex) | ||
| 98 | - | ||
| 99 | -# def load_db(self): | 78 | +#class TagsXapianIndex(xapian.WritableDatabase,Singleton): |
| 79 | +# """ | ||
| 80 | +# Data source for tags info defined as a singleton xapian database. | ||
| 81 | +# """ | ||
| 82 | +# def __init__(self,cfg): | ||
| 100 | # """ | 83 | # """ |
| 101 | -# Load debtags database from the source file. | 84 | +# Set initial attributes. |
| 102 | # """ | 85 | # """ |
| 103 | -# tag_filter = re.compile(r"^special::.+$|^.+::TODO$") | 86 | +# self.path = os.path.expanduser(cfg.tags_index) |
| 87 | +# self.db_path = os.path.expanduser(cfg.tags_db) | ||
| 88 | +# self.debtags_db = debtags.DB() | ||
| 104 | # try: | 89 | # try: |
| 105 | -# db_file = open(self.db_path, "r") | ||
| 106 | -# self.debtags_db.read(db_file,lambda x: not tag_filter.match(x)) | ||
| 107 | -# db_file.close() | ||
| 108 | -# except: | 90 | +# db_file = open(self.db_path) |
| 91 | +# except IOError: | ||
| 109 | # logging.error("Could not load DebtagsDB from '%s'." % self.db_path) | 92 | # logging.error("Could not load DebtagsDB from '%s'." % self.db_path) |
| 110 | # raise Error | 93 | # raise Error |
| 111 | - | ||
| 112 | - def relevant_tags_from_db(self,pkgs_list,qtd_of_tags): | ||
| 113 | - """ | ||
| 114 | - Return most relevant tags considering a list of packages. | ||
| 115 | - """ | ||
| 116 | - if not self.debtags_db.package_count(): | ||
| 117 | - self.debtags_db = load_debtags_db(self.db_path) | ||
| 118 | - relevant_db = self.debtags_db.choose_packages(pkgs_list) | ||
| 119 | - relevance_index = debtags.relevance_index_function(self.debtags_db, | ||
| 120 | - relevant_db) | ||
| 121 | - sorted_relevant_tags = sorted(relevant_db.iter_tags(), | ||
| 122 | - lambda a, b: cmp(relevance_index(a), | ||
| 123 | - relevance_index(b))) | ||
| 124 | - return normalize_tags(' '.join(sorted_relevant_tags[-qtd_of_tags:])) | ||
| 125 | - | ||
| 126 | - def load_index(self,reindex): | ||
| 127 | - """ | ||
| 128 | - Load an existing debtags index. | ||
| 129 | - """ | ||
| 130 | - if not reindex: | ||
| 131 | - try: | ||
| 132 | - logging.info("Opening existing debtags xapian index at \'%s\'" | ||
| 133 | - % self.path) | ||
| 134 | - xapian.Database.__init__(self,self.path) | ||
| 135 | - md5 = self.get_metadata("md5") | ||
| 136 | - if not md5 == self.db_md5: | ||
| 137 | - logging.info("Index must be updated.") | ||
| 138 | - reindex = 1 | ||
| 139 | - except xapian.DatabaseError: | ||
| 140 | - logging.info("Could not open debtags index.") | ||
| 141 | - reindex =1 | ||
| 142 | - | ||
| 143 | - if reindex: | ||
| 144 | - self.new_index() | ||
| 145 | - | ||
| 146 | - def new_index(self): | ||
| 147 | - """ | ||
| 148 | - Create a xapian index for debtags info based on 'debtags_db' and | ||
| 149 | - place it at 'self.path'. | ||
| 150 | - """ | ||
| 151 | - if not os.path.exists(self.path): | ||
| 152 | - os.makedirs(self.path) | ||
| 153 | - | ||
| 154 | - try: | ||
| 155 | - logging.info("Indexing debtags info from \'%s\'" % | ||
| 156 | - self.db_path) | ||
| 157 | - logging.info("Creating new xapian index at \'%s\'" % | ||
| 158 | - self.path) | ||
| 159 | - xapian.WritableDatabase.__init__(self,self.path, | ||
| 160 | - xapian.DB_CREATE_OR_OVERWRITE) | ||
| 161 | - except xapian.DatabaseError: | ||
| 162 | - logging.critical("Could not create xapian index.") | ||
| 163 | - raise Error | ||
| 164 | - | ||
| 165 | - self.debtags_db = load_debtags_db(self.db_path) | ||
| 166 | - self.set_metadata("md5",self.db_md5) | ||
| 167 | - | ||
| 168 | - for pkg,tags in self.debtags_db.iter_packages_tags(): | ||
| 169 | - doc = xapian.Document() | ||
| 170 | - doc.set_data(pkg) | ||
| 171 | - for tag in tags: | ||
| 172 | - doc.add_term(normalize_tags(tag)) | ||
| 173 | - doc_id = self.add_document(doc) | ||
| 174 | - logging.debug("Debtags Xapian: Indexing doc %d",doc_id) | 94 | +# md5 = hashlib.md5() |
| 95 | +# md5.update(db_file.read()) | ||
| 96 | +# self.db_md5 = md5.hexdigest() | ||
| 97 | +# db_file.close() | ||
| 98 | +# self.load_index(cfg.reindex) | ||
| 99 | +# | ||
| 100 | +## def load_db(self): | ||
| 101 | +## """ | ||
| 102 | +## Load debtags database from the source file. | ||
| 103 | +## """ | ||
| 104 | +## tag_filter = re.compile(r"^special::.+$|^.+::TODO$") | ||
| 105 | +## try: | ||
| 106 | +## db_file = open(self.db_path, "r") | ||
| 107 | +## self.debtags_db.read(db_file,lambda x: not tag_filter.match(x)) | ||
| 108 | +## db_file.close() | ||
| 109 | +## except: | ||
| 110 | +## logging.error("Could not load DebtagsDB from '%s'." % self.db_path) | ||
| 111 | +## raise Error | ||
| 112 | +# | ||
| 113 | +# def relevant_tags_from_db(self,pkgs_list,qtd_of_tags): | ||
| 114 | +# """ | ||
| 115 | +# Return most relevant tags considering a list of packages. | ||
| 116 | +# """ | ||
| 117 | +# if not self.debtags_db.package_count(): | ||
| 118 | +# #print "index vazio" | ||
| 119 | +# self.debtags_db = load_debtags_db(self.db_path) | ||
| 120 | +# relevant_db = self.debtags_db.choose_packages(pkgs_list) | ||
| 121 | +# relevance_index = debtags.relevance_index_function(self.debtags_db, | ||
| 122 | +# relevant_db) | ||
| 123 | +# sorted_relevant_tags = sorted(relevant_db.iter_tags(), | ||
| 124 | +# lambda a, b: cmp(relevance_index(a), | ||
| 125 | +# relevance_index(b))) | ||
| 126 | +# return normalize_tags(' '.join(sorted_relevant_tags[-qtd_of_tags:])) | ||
| 127 | +# | ||
| 128 | +# def load_index(self,reindex): | ||
| 129 | +# """ | ||
| 130 | +# Load an existing debtags index. | ||
| 131 | +# """ | ||
| 132 | +# if not reindex: | ||
| 133 | +# try: | ||
| 134 | +# logging.info("Opening existing debtags xapian index at \'%s\'" | ||
| 135 | +# % self.path) | ||
| 136 | +# xapian.Database.__init__(self,self.path) | ||
| 137 | +# md5 = self.get_metadata("md5") | ||
| 138 | +# if not md5 == self.db_md5: | ||
| 139 | +# logging.info("Index must be updated.") | ||
| 140 | +# reindex = 1 | ||
| 141 | +# except xapian.DatabaseError: | ||
| 142 | +# logging.info("Could not open debtags index.") | ||
| 143 | +# reindex =1 | ||
| 144 | +# | ||
| 145 | +# if reindex: | ||
| 146 | +# self.new_index() | ||
| 147 | +# | ||
| 148 | +# def new_index(self): | ||
| 149 | +# """ | ||
| 150 | +# Create a xapian index for debtags info based on 'debtags_db' and | ||
| 151 | +# place it at 'self.path'. | ||
| 152 | +# """ | ||
| 153 | +# if not os.path.exists(self.path): | ||
| 154 | +# os.makedirs(self.path) | ||
| 155 | +# | ||
| 156 | +# try: | ||
| 157 | +# logging.info("Indexing debtags info from \'%s\'" % | ||
| 158 | +# self.db_path) | ||
| 159 | +# logging.info("Creating new xapian index at \'%s\'" % | ||
| 160 | +# self.path) | ||
| 161 | +# xapian.WritableDatabase.__init__(self,self.path, | ||
| 162 | +# xapian.DB_CREATE_OR_OVERWRITE) | ||
| 163 | +# except xapian.DatabaseError: | ||
| 164 | +# logging.critical("Could not create xapian index.") | ||
| 165 | +# raise Error | ||
| 166 | +# | ||
| 167 | +# self.debtags_db = load_debtags_db(self.db_path) | ||
| 168 | +# self.set_metadata("md5",self.db_md5) | ||
| 169 | +# | ||
| 170 | +# for pkg,tags in self.debtags_db.iter_packages_tags(): | ||
| 171 | +# doc = xapian.Document() | ||
| 172 | +# doc.set_data(pkg) | ||
| 173 | +# for tag in tags: | ||
| 174 | +# doc.add_term(normalize_tags(tag)) | ||
| 175 | +# doc_id = self.add_document(doc) | ||
| 176 | +# logging.debug("Debtags Xapian: Indexing doc %d",doc_id) | ||
| 175 | 177 | ||
| 176 | class PopconXapianIndex(xapian.WritableDatabase,Singleton): | 178 | class PopconXapianIndex(xapian.WritableDatabase,Singleton): |
| 177 | """ | 179 | """ |
| @@ -232,7 +234,7 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton): | @@ -232,7 +234,7 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton): | ||
| 232 | """ | 234 | """ |
| 233 | if not os.path.exists(self.path): | 235 | if not os.path.exists(self.path): |
| 234 | os.makedirs(self.path) | 236 | os.makedirs(self.path) |
| 235 | - debtags_db = load_debtags_db(self.debtags_path) | 237 | + debtags_db = load_debtags_db(self.debtags_path) #[FIXME] |
| 236 | 238 | ||
| 237 | try: | 239 | try: |
| 238 | logging.info("Indexing popcon submissions from \'%s\'" % | 240 | logging.info("Indexing popcon submissions from \'%s\'" % |
| @@ -254,6 +256,7 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton): | @@ -254,6 +256,7 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton): | ||
| 254 | submission_path) | 256 | submission_path) |
| 255 | for pkg, freq in self.parse_submission(submission_path): | 257 | for pkg, freq in self.parse_submission(submission_path): |
| 256 | doc.add_term(pkg,freq) | 258 | doc.add_term(pkg,freq) |
| 259 | + #[FIXME] get tags from axi | ||
| 257 | for tag in debtags_db.tags_of_package(pkg): | 260 | for tag in debtags_db.tags_of_package(pkg): |
| 258 | doc.add_term("XT"+tag,freq) | 261 | doc.add_term("XT"+tag,freq) |
| 259 | doc_id = self.add_document(doc) | 262 | doc_id = self.add_document(doc) |
| @@ -334,22 +337,27 @@ class PopconClusteredData(Singleton): | @@ -334,22 +337,27 @@ class PopconClusteredData(Singleton): | ||
| 334 | s.add_pkg(pkg) | 337 | s.add_pkg(pkg) |
| 335 | self.submissions.append(s) | 338 | self.submissions.append(s) |
| 336 | 339 | ||
| 337 | - distanceFunction = JaccardIndex() | ||
| 338 | - cl = cluster.HierarchicalClustering(self.submissions,lambda x,y: distanceFunction(x.pkgs_list,y.pkgs_list)) | ||
| 339 | - clusters = cl.getlevel(0.5) | ||
| 340 | - for c in clusters: | ||
| 341 | - print "cluster" | ||
| 342 | - for submission in c: | ||
| 343 | - print submission.hash | ||
| 344 | - #cl = KMeansClusteringPopcon(self.submissions, | ||
| 345 | - # lambda x,y: distanceFunction(x.pkgs_list,y.pkgs_list)) | 340 | + distanceFunction = JaccardDistance() |
| 341 | + # cl = cluster.HierarchicalClustering(self.submissions,lambda x,y: distanceFunction(x.pkgs_list,y.pkgs_list)) | ||
| 342 | + # clusters = cl.getlevel(0.5) | ||
| 343 | + # for c in clusters: | ||
| 344 | + # print "cluster" | ||
| 345 | + # for submission in c: | ||
| 346 | + # print submission.hash | ||
| 347 | + cl = KMedoidsClusteringPopcon(self.submissions, lambda x,y: \ | ||
| 348 | + distanceFunction(x.pkgs_list,y.pkgs_list)) | ||
| 346 | #clusters = cl.getclusters(2) | 349 | #clusters = cl.getclusters(2) |
| 347 | - #medoids = cl.getMedoids(2) | 350 | + medoids = cl.getMedoids(2) |
| 351 | + print "medoids" | ||
| 352 | + for m in medoids: | ||
| 353 | + print m.hash | ||
| 348 | 354 | ||
| 349 | class KMedoidsClusteringPopcon(cluster.KMeansClustering): | 355 | class KMedoidsClusteringPopcon(cluster.KMeansClustering): |
| 350 | 356 | ||
| 351 | def __init__(self,data,distance): | 357 | def __init__(self,data,distance): |
| 352 | - cluster.KMeansClustering.__init__(self, data, distance) | 358 | + if len(data)>100: |
| 359 | + data_sample = random.sample(data,100) | ||
| 360 | + cluster.KMeansClustering.__init__(self, data_sample, distance) | ||
| 353 | self.distanceMatrix = {} | 361 | self.distanceMatrix = {} |
| 354 | for submission in self._KMeansClustering__data: | 362 | for submission in self._KMeansClustering__data: |
| 355 | self.distanceMatrix[submission.hash] = {} | 363 | self.distanceMatrix[submission.hash] = {} |
| @@ -377,7 +385,7 @@ class KMedoidsClusteringPopcon(cluster.KMeansClustering): | @@ -377,7 +385,7 @@ class KMedoidsClusteringPopcon(cluster.KMeansClustering): | ||
| 377 | for i in range(len(cluster)): | 385 | for i in range(len(cluster)): |
| 378 | totalDistance = sum(self.distanceMatrix[cluster[i].hash].values()) | 386 | totalDistance = sum(self.distanceMatrix[cluster[i].hash].values()) |
| 379 | print "totalDistance[",i,"]=",totalDistance | 387 | print "totalDistance[",i,"]=",totalDistance |
| 380 | - if totalDistance < centroidDistance: | 388 | + if totalDistance < medoidDistance: |
| 381 | medoidDistance = totalDistance | 389 | medoidDistance = totalDistance |
| 382 | medoid = i | 390 | medoid = i |
| 383 | print "medoidDistance:",medoidDistance | 391 | print "medoidDistance:",medoidDistance |
src/dissimilarity.py
| 1 | #!/usr/bin/env python | 1 | #!/usr/bin/env python |
| 2 | """ | 2 | """ |
| 3 | - similarity - python module for classes and methods related to similarity | ||
| 4 | - measuring between two sets of data. | 3 | + dissimilarity - python module for classes and methods related to similarity |
| 4 | + measuring between two sets of data. | ||
| 5 | """ | 5 | """ |
| 6 | __author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | 6 | __author__ = "Tassia Camoes Araujo <tassia@gmail.com>" |
| 7 | __copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | 7 | __copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" |
src/strategy.py
| @@ -144,30 +144,30 @@ class ItemReputationStrategy(RecommendationStrategy): | @@ -144,30 +144,30 @@ class ItemReputationStrategy(RecommendationStrategy): | ||
| 144 | logging.critical("Item reputation recommendation strategy is not yet implemented.") | 144 | logging.critical("Item reputation recommendation strategy is not yet implemented.") |
| 145 | raise Error | 145 | raise Error |
| 146 | 146 | ||
| 147 | -class ContentBasedStrategy(RecommendationStrategy): | ||
| 148 | - """ | ||
| 149 | - Content-based recommendation strategy. | ||
| 150 | - """ | ||
| 151 | - def run(self,rec,user): | ||
| 152 | - """ | ||
| 153 | - Perform recommendation strategy. | ||
| 154 | - """ | ||
| 155 | - profile = user.txi_tag_profile(rec.items_repository,50) | ||
| 156 | - qp = xapian.QueryParser() | ||
| 157 | - query = qp.parse_query(profile) | ||
| 158 | - enquire = xapian.Enquire(rec.items_repository) | ||
| 159 | - enquire.set_query(query) | ||
| 160 | - | ||
| 161 | - try: | ||
| 162 | - mset = enquire.get_mset(0, 20, None, PkgMatchDecider(user.items())) | ||
| 163 | - except xapian.DatabaseError as error: | ||
| 164 | - logging.critical(error.get_msg()) | ||
| 165 | - raise Error | ||
| 166 | - | ||
| 167 | - item_score = {} | ||
| 168 | - for m in mset: | ||
| 169 | - item_score[m.document.get_data()] = m.rank | ||
| 170 | - return recommender.RecommendationResult(item_score,20) | 147 | +#class ContentBasedStrategy(RecommendationStrategy): |
| 148 | +# """ | ||
| 149 | +# Content-based recommendation strategy. | ||
| 150 | +# """ | ||
| 151 | +# def run(self,rec,user): | ||
| 152 | +# """ | ||
| 153 | +# Perform recommendation strategy. | ||
| 154 | +# """ | ||
| 155 | +# profile = user.txi_tag_profile(rec.items_repository,50) | ||
| 156 | +# qp = xapian.QueryParser() | ||
| 157 | +# query = qp.parse_query(profile) | ||
| 158 | +# enquire = xapian.Enquire(rec.items_repository) | ||
| 159 | +# enquire.set_query(query) | ||
| 160 | +# | ||
| 161 | +# try: | ||
| 162 | +# mset = enquire.get_mset(0, 20, None, PkgMatchDecider(user.items())) | ||
| 163 | +# except xapian.DatabaseError as error: | ||
| 164 | +# logging.critical(error.get_msg()) | ||
| 165 | +# raise Error | ||
| 166 | +# | ||
| 167 | +# item_score = {} | ||
| 168 | +# for m in mset: | ||
| 169 | +# item_score[m.document.get_data()] = m.rank | ||
| 170 | +# return recommender.RecommendationResult(item_score,20) | ||
| 171 | 171 | ||
| 172 | class AxiContentBasedStrategy(RecommendationStrategy): | 172 | class AxiContentBasedStrategy(RecommendationStrategy): |
| 173 | """ | 173 | """ |
src/user.py
| @@ -19,10 +19,12 @@ __license__ = """ | @@ -19,10 +19,12 @@ __license__ = """ | ||
| 19 | along with this program. If not, see <http://www.gnu.org/licenses/>. | 19 | along with this program. If not, see <http://www.gnu.org/licenses/>. |
| 20 | """ | 20 | """ |
| 21 | 21 | ||
| 22 | +import random | ||
| 22 | import commands | 23 | import commands |
| 23 | import xapian | 24 | import xapian |
| 24 | import logging | 25 | import logging |
| 25 | import apt | 26 | import apt |
| 27 | +from singleton import Singleton | ||
| 26 | 28 | ||
| 27 | class FilterTag(xapian.ExpandDecider): | 29 | class FilterTag(xapian.ExpandDecider): |
| 28 | """ | 30 | """ |
| @@ -34,51 +36,84 @@ class FilterTag(xapian.ExpandDecider): | @@ -34,51 +36,84 @@ class FilterTag(xapian.ExpandDecider): | ||
| 34 | """ | 36 | """ |
| 35 | return term[:2] == "XT" | 37 | return term[:2] == "XT" |
| 36 | 38 | ||
| 39 | +class DemographicProfile(Singleton): | ||
| 40 | + def __init__(self): | ||
| 41 | + self.admin = set(["admin", "hardware", "mail", "protocol", | ||
| 42 | + "network", "security", "web", "interface::web"]) | ||
| 43 | + self.devel = set(["devel", "role::devel-lib", "role::shared-lib"]) | ||
| 44 | + self.desktop = set(["x11", "accessibility", "game", "junior", "office", | ||
| 45 | + "interface::x11"]) | ||
| 46 | + self.art = set(["field::arts", "sound"]) | ||
| 47 | + self.science = set(["science", "biology", "field::astronomy", | ||
| 48 | + "field::aviation", "field::biology", | ||
| 49 | + "field::chemistry", "field::eletronics", | ||
| 50 | + "field::finance", "field::geography", | ||
| 51 | + "field::geology", "field::linguistics", | ||
| 52 | + "field::mathematics", "field::medicine", | ||
| 53 | + "field::meteorology", "field::physics", | ||
| 54 | + "field::statistics"]) | ||
| 55 | + | ||
| 56 | + def __call__(self,profiles_set): | ||
| 57 | + demographic_profile = set() | ||
| 58 | + for profile in profiles_set: | ||
| 59 | + demographic_profile = (demographic_profile | eval("self."+profile,{},{"self":self})) | ||
| 60 | + return demographic_profile | ||
| 61 | + | ||
| 37 | class User: | 62 | class User: |
| 38 | """ | 63 | """ |
| 39 | Define a user of a recommender. | 64 | Define a user of a recommender. |
| 40 | """ | 65 | """ |
| 41 | - def __init__(self,item_score,user_id=0,demographic_profile=0): | 66 | + def __init__(self,item_score,user_id=0,profiles_set=0): |
| 42 | """ | 67 | """ |
| 43 | - Set initial parameters. | 68 | + Set initial user attributes. If no user_id was passed as parameter, a |
| 69 | + random md5-hash is generated for that purpose. If the demographic | ||
| 70 | + profile was not defined, it defaults to 'desktop' | ||
| 44 | """ | 71 | """ |
| 45 | - self.id = user_id | ||
| 46 | self.item_score = item_score | 72 | self.item_score = item_score |
| 73 | + if user_id: | ||
| 74 | + self.id = user_id | ||
| 75 | + else: | ||
| 76 | + random.seed() | ||
| 77 | + self.id = random.getrandbits(128) | ||
| 47 | self.pkg_profile = self.item_score.keys() | 78 | self.pkg_profile = self.item_score.keys() |
| 48 | - self.demographic_profile = demographic_profile | 79 | + if not profiles_set: |
| 80 | + profiles_set = set(["desktop"]) | ||
| 81 | + self.set_demographic_profile(profiles_set) | ||
| 82 | + | ||
| 83 | + def set_demographic_profile(self,profiles_set): | ||
| 84 | + self.demographic_profile = DemographicProfile()(profiles_set) | ||
| 49 | 85 | ||
| 50 | def items(self): | 86 | def items(self): |
| 51 | """ | 87 | """ |
| 52 | - Return dictionary relating items and repective scores. | 88 | + Return the set of user items. |
| 53 | """ | 89 | """ |
| 54 | - return self.item_score.keys() | 90 | + return set(self.item_score.keys()) |
| 55 | 91 | ||
| 56 | def axi_tag_profile(self,apt_xapian_index,profile_size): | 92 | def axi_tag_profile(self,apt_xapian_index,profile_size): |
| 57 | """ | 93 | """ |
| 58 | Return most relevant tags for a list of packages based on axi. | 94 | Return most relevant tags for a list of packages based on axi. |
| 59 | """ | 95 | """ |
| 60 | - terms = [] | ||
| 61 | - for item in self.pkg_profile: | ||
| 62 | - terms.append("XP"+item) | 96 | + terms = ["XP"+item for item in self.pkg_profile] |
| 63 | query = xapian.Query(xapian.Query.OP_OR, terms) | 97 | query = xapian.Query(xapian.Query.OP_OR, terms) |
| 64 | enquire = xapian.Enquire(apt_xapian_index) | 98 | enquire = xapian.Enquire(apt_xapian_index) |
| 65 | enquire.set_query(query) | 99 | enquire.set_query(query) |
| 66 | rset = xapian.RSet() | 100 | rset = xapian.RSet() |
| 67 | - for m in enquire.get_mset(0,30000): #consider all matches | 101 | + for m in enquire.get_mset(0,apt_xapian_index.get_doccount()): |
| 68 | rset.add_document(m.docid) | 102 | rset.add_document(m.docid) |
| 103 | + # statistically good differentiators between relevant and non-relevant | ||
| 69 | eset = enquire.get_eset(profile_size, rset, FilterTag()) | 104 | eset = enquire.get_eset(profile_size, rset, FilterTag()) |
| 70 | profile = [] | 105 | profile = [] |
| 71 | for res in eset: | 106 | for res in eset: |
| 72 | profile.append(res.term) | 107 | profile.append(res.term) |
| 73 | - logging.debug("%.2f %s" % (res.weight,res.term[2:])) | 108 | + logging.debug("%.2f %s" % (res.weight,res.term.lstrip("XT"))) |
| 74 | return profile | 109 | return profile |
| 75 | 110 | ||
| 76 | - def txi_tag_profile(self,tags_xapian_index,profile_size): | ||
| 77 | - """ | ||
| 78 | - Return most relevant tags for a list of packages based on tags index. | ||
| 79 | - """ | ||
| 80 | - return tags_xapian_index.relevant_tags_from_db(self.pkg_profile, | ||
| 81 | - profile_size) | 111 | + #def txi_tag_profile(self,tags_xapian_index,profile_size): |
| 112 | + # """ | ||
| 113 | + # Return most relevant tags for a list of packages based on tags index. | ||
| 114 | + # """ | ||
| 115 | + # return tags_xapian_index.relevant_tags_from_db(self.pkg_profile, | ||
| 116 | + # profile_size) | ||
| 82 | 117 | ||
| 83 | def maximal_pkg_profile(self): | 118 | def maximal_pkg_profile(self): |
| 84 | """ | 119 | """ |