diff --git a/src/data.py b/src/data.py index 15d617e..80d404e 100644 --- a/src/data.py +++ b/src/data.py @@ -31,6 +31,7 @@ import shutil from error import Error from singleton import Singleton from dissimilarity import * +from config import Config def axi_search_pkgs(axi,pkgs_list): terms = ["XP"+item for item in pkgs_list] @@ -38,19 +39,22 @@ def axi_search_pkgs(axi,pkgs_list): enquire = xapian.Enquire(axi) enquire.set_query(query) matches = enquire.get_mset(0,axi.get_doccount()) - return matches + return [m.docid for m in matches] def axi_search_pkg_tags(axi,pkg): enquire = xapian.Enquire(axi) enquire.set_query(xapian.Query("XP"+pkg)) matches = enquire.get_mset(0,1) if not matches: - #logging.debug("Package %s not found in items repository" % pkg) - return [] + logging.debug("Package %s not found in items repository" % pkg) + return False for m in matches: tags = [term.term for term in axi.get_document(m.docid).termlist() if term.term.startswith("XT")] - return tags + if not tags: + return "notags" + else: + return tags def print_index(index): output = "\n---\n" + xapian.Database.__repr__(index) + "\n---\n" @@ -96,7 +100,7 @@ class SampleAptXapianIndex(xapian.WritableDatabase): xapian.DB_CREATE_OR_OVERWRITE) sample = axi_search_pkgs(axi,pkgs_list) for package in sample: - doc_id = self.add_document(axi.get_document(package.docid)) + doc_id = self.add_document(axi.get_document(package)) def __str__(self): return print_index(self) @@ -115,6 +119,14 @@ class PopconSubmission(): output += "\n "+pkg+": "+str(weight) return output + def apps(self,axi): + apps = {} + for pkg in self.packages.keys(): + tags = axi_search_pkg_tags(self.axi,pkg) + if "XTrole::program" in tags: + apps[pkg] = self.packages[pkg] + return apps + def load(self,binary=1): """ Parse a popcon submission, generating the names of the valid packages @@ -159,6 +171,16 @@ class PopconXapianIndex(xapian.WritableDatabase): self.path = os.path.expanduser(cfg.popcon_index) self.source_dir = os.path.expanduser(cfg.popcon_dir) self.max_popcon = cfg.max_popcon + self.valid_pkgs = [] + # file format: one pkg_name per line + with open(os.path.join(cfg.filters,cfg.pkgs_filter)) as valid_pkgs: + self.valid_pkgs = [line.strip() for line in valid_pkgs + if not line.startswith("#")] + logging.debug("Considering %d valid packages" % len(self.valid_pkgs)) + with open(os.path.join(cfg.filters,"tags")) as valid_tags: + self.valid_tags = [line.strip() for line in valid_tags + if not line.startswith("#")] + logging.debug("Considering %d valid tags" % len(self.valid_tags)) if not cfg.index_mode == "old" or not self.load_index(): if not os.path.exists(cfg.popcon_dir): os.makedirs(cfg.popcon_dir) @@ -243,10 +265,16 @@ class PopconXapianIndex(xapian.WritableDatabase): logging.debug("Parsing popcon submission \'%s\'" % submission.user_id) for pkg, freq in submission.packages.items(): - doc.add_term("XP"+pkg,freq) - #if axi_search_pkg_tags(self.axi,pkg): - # for tag in axi_search_pkg_tags(self.axi,pkg): - # doc.add_term(tag,freq) + if pkg in self.valid_pkgs: + tags = axi_search_pkg_tags(self.axi,pkg) + # if the package was foung in axi + if tags: + doc.add_term("XP"+pkg,freq) + # if the package has tags associated with it + if not tags == "notags": + for tag in tags: + if tag in self.valid_tags: + doc.add_term(tag,freq) doc_id = self.add_document(doc) doc_count += 1 logging.debug("Popcon Xapian: Indexing doc %d" % doc_id) @@ -256,7 +284,7 @@ class PopconXapianIndex(xapian.WritableDatabase): try: self.commit() except: - self.flush() # deprecated function, used for old lib version + self.flush() # deprecated function, used for compatibility with old lib version def get_submissions(self,submissions_dir): """ @@ -288,9 +316,7 @@ class KMedoidsClustering(cluster.KMeansClustering): data_sample = data else: data_sample = random.sample(data,max_data) - print data_sample cluster.KMeansClustering.__init__(self, data_sample, distance) - # cluster.KMeansClustering.__init__(self, data, distance) self.distanceMatrix = {} for submission in self._KMeansClustering__data: self.distanceMatrix[submission.user_id] = {} diff --git a/src/evaluation.py b/src/evaluation.py index 9e96762..3697618 100644 --- a/src/evaluation.py +++ b/src/evaluation.py @@ -25,6 +25,7 @@ import random from collections import defaultdict import logging +from error import Error from user import * from recommender import * from singleton import Singleton @@ -271,11 +272,15 @@ class CrossValidation: """ Perform cross-validation. """ - # - cross_item_score = dict.fromkeys(user.pkg_profile,1) + # Extracting user profile scores from cross validation + cross_item_score = {} + for pkg in user.pkg_profile: + cross_item_score[pkg] = user.item_score[pkg] partition_size = int(len(cross_item_score)*self.partition_proportion) + # main iteration for r in range(self.rounds): round_partition = {} + # move items from cross_item_score to round-partition for j in range(partition_size): if len(cross_item_score)>0: random_key = random.choice(cross_item_score.keys()) @@ -283,20 +288,25 @@ class CrossValidation: logging.critical("Empty cross_item_score.") raise Error round_partition[random_key] = cross_item_score.pop(random_key) - #logging.debug("Round partition: %s",str(round_partition)) - #logging.debug("Cross item-score: %s",str(cross_item_score)) + logging.debug("Round partition: %s",str(round_partition)) + logging.debug("Cross item-score: %s",str(cross_item_score)) + # round user is created with remaining items round_user = User(cross_item_score) result_size = int(self.recommender.items_repository.get_doccount()* self.result_proportion) predicted_result = self.recommender.get_recommendation(round_user,result_size) - #print len(round_partition) + if not predicted_result.size: + logging.critical("No recommendation produced. Abort cross-validation.") + raise Error + # partition is considered the expected result real_result = RecommendationResult(round_partition) - #logging.debug("Predicted result: %s",predicted_result) + logging.debug("Predicted result: %s",predicted_result) evaluation = Evaluation(predicted_result,real_result, self.recommender.items_repository.get_doccount()) for metric in self.metrics_list: result = evaluation.run(metric) self.cross_results[metric.desc].append(result) + # moving back items from round_partition to cross_item_score while len(round_partition)>0: item,score = round_partition.popitem() cross_item_score[item] = score diff --git a/src/recommender.py b/src/recommender.py index 44c1abc..abc9151 100644 --- a/src/recommender.py +++ b/src/recommender.py @@ -78,15 +78,23 @@ class Recommender: """ Set the recommendation strategy. """ - if strategy_str == "cb": - self.strategy = strategy.ContentBasedStrategy("full") - if strategy_str == "cbt": - self.strategy = strategy.ContentBasedStrategy("tag") - if strategy_str == "cbd": - self.strategy = strategy.ContentBasedStrategy("desc") - if strategy_str == "col": + self.items_repository = xapian.Database(self.cfg.axi) + if "desktop" in strategy_str: + self.items_repository = xapian.Database("/root/.app-recommender/DesktopAxi") + self.cfg.popcon_index = "/root/.app-recommender/popcon-index_desktop_1000" + + if strategy_str == "cb" or strategy_str == "cb_desktop": + self.strategy = strategy.ContentBasedStrategy("full", + self.cfg.profile_size) + if strategy_str == "cbt" or strategy_str == "cbt_desktop": + self.strategy = strategy.ContentBasedStrategy("tag", + self.cfg.profile_size) + if strategy_str == "cbd" or strategy_str == "cbd_desktop": + self.strategy = strategy.ContentBasedStrategy("desc", + self.cfg.profile_size) + if "col" in strategy_str: self.users_repository = data.PopconXapianIndex(self.cfg) - self.strategy = strategy.CollaborativeStrategy(20) + self.strategy = strategy.CollaborativeStrategy(self.cfg.k_neighbors) def get_recommendation(self,user,result_size=100): """ diff --git a/src/strategy.py b/src/strategy.py index 8ad6b8c..9982c2c 100644 --- a/src/strategy.py +++ b/src/strategy.py @@ -140,7 +140,7 @@ class ContentBasedStrategy(RecommendationStrategy): """ Content-based recommendation strategy based on Apt-xapian-index. """ - def __init__(self,content,profile_size=50): + def __init__(self,content,profile_size): self.description = "Content-based" self.content = content self.profile_size = profile_size @@ -149,8 +149,8 @@ class ContentBasedStrategy(RecommendationStrategy): """ Perform recommendation strategy. """ - profile = user.profile(rec.items_repository,self.content, - self.profile_size) + profile = user.content_profile(rec.items_repository,self.content, + self.profile_size) # prepair index for querying user profile query = xapian.Query(xapian.Query.OP_OR,profile) enquire = xapian.Enquire(rec.items_repository) @@ -188,7 +188,8 @@ class CollaborativeStrategy(RecommendationStrategy): """ Perform recommendation strategy. """ - profile = ["XP"+package for package in user.pkg_profile] + profile = ["XP"+package for package in + user.filter_pkg_profile("/root/.app-recommender/filters/program")] # prepair index for querying user profile query = xapian.Query(xapian.Query.OP_OR,profile) enquire = xapian.Enquire(rec.users_repository) @@ -210,13 +211,15 @@ class CollaborativeStrategy(RecommendationStrategy): eset = enquire.get_eset(recommendation_size,rset,PkgExpandDecider()) # compose result dictionary item_score = {} + ranking = [] for e in eset: package = e.term.lstrip("XP") tags = axi_search_pkg_tags(rec.items_repository,package) #[FIXME] set this constraint somehow #if "XTrole::program" in tags: item_score[package] = e.weight - return recommender.RecommendationResult(item_score) + ranking.append(m.document.get_data()) + return recommender.RecommendationResult(item_score, ranking) class DemographicStrategy(RecommendationStrategy): """ diff --git a/src/user.py b/src/user.py index 26c5286..11a7ec8 100644 --- a/src/user.py +++ b/src/user.py @@ -19,8 +19,10 @@ __license__ = """ along with this program. If not, see . """ +import os import random import commands +import datetime import xapian import logging import apt @@ -43,9 +45,10 @@ class FilterDescription(xapian.ExpandDecider): """ def __call__(self, term): """ - Return true if the term is a tag, else false. + Return true if the term or its stemmed version is part of a package + description. """ - return term.islower() #or term.startswith("Z") + return term.islower() or term.startswith("Z") class DemographicProfile(Singleton): def __init__(self): @@ -84,7 +87,7 @@ class User: self.pkg_profile = self.items() if user_id: - self.id = user_id + self.user_id = user_id else: random.seed() self.id = random.getrandbits(128) @@ -105,7 +108,7 @@ class User: """ self.demographic_profile = DemographicProfile()(profiles_set) - def profile(self,items_repository,content,size): + def content_profile(self,items_repository,content,size): """ Get user profile for a specific type of content: packages tags, description or both (full_profile) @@ -119,10 +122,10 @@ class User: Return most relevant tags for a list of packages. """ enquire = xapian.Enquire(items_repository) - matches = data.axi_search_pkgs(items_repository,self.pkg_profile) + docs = data.axi_search_pkgs(items_repository,self.pkg_profile) rset_packages = xapian.RSet() - for m in matches: - rset_packages.add_document(m.docid) + for docid in docs: + rset_packages.add_document(docid) # statistically good differentiators eset_tags = enquire.get_eset(size, rset_packages, FilterTag()) profile = [res.term for res in eset_tags] @@ -134,10 +137,10 @@ class User: text descriptions. """ enquire = xapian.Enquire(items_repository) - matches = data.axi_search_pkgs(items_repository,self.pkg_profile) + docs = data.axi_search_pkgs(items_repository,self.pkg_profile) rset_packages = xapian.RSet() - for m in matches: - rset_packages.add_document(m.docid) + for docid in docs: + rset_packages.add_document(docid) eset_keywords = enquire.get_eset(size, rset_packages, FilterDescription()) profile = [res.term for res in eset_keywords] @@ -152,21 +155,19 @@ class User: desc_profile = self.desc_profile(items_repository,size)[:size/2] return tag_profile+desc_profile - def app_pkg_profile(self,axi): + def filter_pkg_profile(self,filter_file): """ - Return list of packages that are applications. + Return list of packages from profile listed in the filter_file. """ old_profile_size = len(self.pkg_profile) - for p in self.pkg_profile[:]: #iterate list copy - tags = data.axi_search_pkg_tags(axi,p) - try: - - if not "XTrole::program" in tags: - self.pkg_profile.remove(p) - except: - logging.debug("Package not found in axi: %s" % p) + with open(filter_file) as valid: + valid_pkgs = [line.strip() for line in valid] + for pkg in self.pkg_profile[:]: #iterate list copy + if pkg not in valid_pkgs: + self.pkg_profile.remove(pkg) + logging.debug("Discarded package %s during profile filtering" % pkg) profile_size = len(self.pkg_profile) - logging.debug("App package profile: reduced packages profile size \ + logging.debug("Filtered package profile: reduced packages profile size \ from %d to %d." % (old_profile_size, profile_size)) return self.pkg_profile @@ -193,6 +194,33 @@ class User: from %d to %d." % (old_profile_size, profile_size)) return self.pkg_profile +class RandomPopcon(User): + def __init__(self,submissions_dir,pkgs_filter=0): + """ + Set initial parameters. + """ + item_score = {} + len_profile = 0 + while len_profile < 100: + path = random.choice([os.path.join(root, submission) for + root, dirs, files in os.walk(submissions_dir) + for submission in files]) + user = PopconSystem(path) + if pkgs_filter: + user.filter_pkg_profile(pkgs_filter) + len_profile = len(user.pkg_profile) + submission = data.PopconSubmission(path) + User.__init__(self,submission.packages,submission.user_id) + +class PopconSystem(User): + def __init__(self,path): + """ + Set initial parameters. + """ + item_score = {} + submission = data.PopconSubmission(path) + User.__init__(self,submission.packages,submission.user_id) + class LocalSystem(User): """ Extend the class User to consider the packages installed on the local @@ -207,6 +235,7 @@ class LocalSystem(User): for line in dpkg_output.splitlines(): pkg = line.split('\t')[0] item_score[pkg] = 1 + self.user_id = "local-"+str(datetime.datetime.now()) User.__init__(self,item_score) def no_auto_pkg_profile(self): -- libgit2 0.21.2