diff --git a/src/config.py b/src/config.py index 8183079..72ec997 100644 --- a/src/config.py +++ b/src/config.py @@ -41,6 +41,8 @@ class Config(): self.tags_index = "~/.app-recommender/debtags_index" self.axi = "/var/lib/apt-xapian-index/index" self.axi_values = "/var/lib/apt-xapian-index/values" + self.popcon_index = "~/.app-recommender/popcon_index" + self.popcon_dir = "~/.app-recommender/popcon_dir" self.strategy = "ct" # defaults to the cheapest one self.reindex = 0 self.load_options() @@ -62,6 +64,8 @@ class Config(): print " -i, --tagsindex=PATH Path to debtags dedicated index." print " -r, --force-reindex Force reindexing debtags database." print " -a, --axi=PATH Path to Apt-xapian-index." + print " -p, --popconindex=PATH Path to popcon dedicated index." + print " -m, --popcondir=PATH Path to popcon submissions dir." print " -s, --strategy=OPTION Recommendation strategy." print "" print " [ strategy options ] " @@ -104,10 +108,13 @@ class Config(): self.tags_index = self.read_option('recommender', 'tags_index') self.reindex = self.read_option('recommender', 'reindex') self.axi = self.read_option('recommender', 'axi') + self.popcon_index = self.read_option('recommender', 'popcon_index') + self.popcon_dir = self.read_option('recommender', 'popcon_dir') - short_options = "hdvo:c:t:i:ra:s:" + short_options = "hdvo:c:t:i:ra:p:m:s:" long_options = ["help", "debug", "verbose", "output=", "config=", - "tagsdb=", "tagsindex=", "reindex", "axi=", "strategy="] + "tagsdb=", "tagsindex=", "reindex", "axi=", + "popconindex=", "popcondir=", "strategy="] try: opts, args = getopt.getopt(sys.argv[1:], short_options, long_options) @@ -138,6 +145,10 @@ class Config(): elif o in ("-a", "--axi"): self.axi = p + "/index" self.axi_values = p + "/values" + elif o in ("-p", "--popconindex"): + self.popcon_index = p + elif o in ("-m", "--popcondir"): + self.popcon_dir = p elif o in ("-s", "--strategy"): self.strategy = p else: diff --git a/src/data.py b/src/data.py index 
b950556..0f62530 100644 --- a/src/data.py +++ b/src/data.py @@ -19,6 +19,7 @@ import os import sys +import gc import re import xapian import axi @@ -53,6 +54,21 @@ def normalize_tags(string): """ return string.replace(':','_').replace('-','\'') +def load_debtags_db(db_path): + """ + Load debtags database from the source file. + """ + tag_filter = re.compile(r"^special::.+$|^.+::TODO$") + try: + db_file = open(db_path, "r") + debtags_db = debtags.DB() + debtags_db.read(db_file,lambda x: not tag_filter.match(x)) + db_file.close() + return debtags_db + except: + logging.error("Could not load DebtagsDB from '%s'." % db_path) + raise Error + class TagsXapianIndex(xapian.WritableDatabase,Singleton): """ Data source for tags info defined as a singleton xapian database. @@ -76,25 +92,25 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton): db_file.close() self.load_index(cfg.reindex) - def load_db(self): - """ - Load debtags database from the source file. - """ - tag_filter = re.compile(r"^special::.+$|^.+::TODO$") - try: - db_file = open(self.db_path, "r") - self.debtags_db.read(db_file,lambda x: not tag_filter.match(x)) - db_file.close() - except: - logging.error("Could not load DebtagsDB from '%s'." % self.db_path) - raise Error +# def load_db(self): +# """ +# Load debtags database from the source file. +# """ +# tag_filter = re.compile(r"^special::.+$|^.+::TODO$") +# try: +# db_file = open(self.db_path, "r") +# self.debtags_db.read(db_file,lambda x: not tag_filter.match(x)) +# db_file.close() +# except: +# logging.error("Could not load DebtagsDB from '%s'." % self.db_path) +# raise Error def relevant_tags_from_db(self,pkgs_list,qtd_of_tags): """ Return most relevant tags considering a list of packages. 
""" if not self.debtags_db.package_count(): - self.load_db() + self.debtags_db = load_debtags_db(self.db_path) relevant_db = self.debtags_db.choose_packages(pkgs_list) relevance_index = debtags.relevance_index_function(self.debtags_db, relevant_db) @@ -117,7 +133,7 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton): logging.info("Index must be updated.") reindex = 1 except xapian.DatabaseError: - logging.info("Could not open index.") + logging.info("Could not open debtags index.") reindex =1 if reindex: @@ -126,13 +142,15 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton): def new_index(self): """ Create a xapian index for debtags info based on 'debtags_db' and - place it at 'index_path'. + place it at 'self.path'. """ if not os.path.exists(self.path): os.makedirs(self.path) try: - logging.info("Creating new xapian index for debtags at \'%s\'" % + logging.info("Indexing debtags info from \'%s\'" % + self.db_path) + logging.info("Creating new xapian index at \'%s\'" % self.path) xapian.WritableDatabase.__init__(self,self.path, xapian.DB_CREATE_OR_OVERWRITE) @@ -140,7 +158,7 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton): logging.critical("Could not create xapian index.") raise Error - self.load_db() + self.debtags_db = load_debtags_db(self.db_path) self.set_metadata("md5",self.db_md5) for pkg,tags in self.debtags_db.iter_packages_tags(): @@ -149,4 +167,94 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton): for tag in tags: doc.add_term(normalize_tags(tag)) doc_id = self.add_document(doc) - logging.debug("Indexing doc %d",doc_id) + logging.debug("Debtags Xapian: Indexing doc %d",doc_id) + +class PopconXapianIndex(xapian.WritableDatabase,Singleton): + """ + Data source for popcon submissions defined as a singleton xapian database. + """ + def __init__(self,cfg): + """ + Set initial attributes. 
+ """ + self.path = os.path.expanduser(cfg.popcon_index) + self.popcon_dir = os.path.expanduser(cfg.popcon_dir) + self.debtags_path = os.path.expanduser(cfg.tags_db) + self.load_index() + + def parse_submission(self,submission_path,binary=1): + """ + Parse a popcon submission, generating the names of the valid packages + in the vote. + """ + submission = open(submission_path) + for line in submission: + if not line.startswith("POPULARITY"): + if not line.startswith("END-POPULARITY"): + data = line[:-1].split(" ") + if len(data) > 3: + if binary: + # every installed package has the same weight + yield data[2], 1 + elif data[3] == '<NOFILES>': + # No executable files to track + yield data[2], 1 + elif len(data) == 4: + # Recently used packages + yield data[2], 10 + elif data[4] == '<OLD>': + # Unused packages + yield data[2], 3 + elif data[4] == '<RECENT-CTIME>': + # Recently installed packages + yield data[2], 8 + + def load_index(self): + """ + Load an existing popcon index. + """ + try: + logging.info("Opening existing popcon xapian index at \'%s\'" + % self.path) + xapian.Database.__init__(self,self.path) + except xapian.DatabaseError: + logging.info("Could not open popcon index.") + self.new_index() + + def new_index(self): + """ + Create a xapian index for popcon submissions at 'popcon_dir' and + place it at 'self.path'. 
+ """ + if not os.path.exists(self.path): + os.makedirs(self.path) + debtags_db = load_debtags_db(self.debtags_path) + + try: + logging.info("Indexing popcon submissions from \'%s\'" % + self.popcon_dir) + logging.info("Creating new xapian index at \'%s\'" % + self.path) + xapian.WritableDatabase.__init__(self,self.path, + xapian.DB_CREATE_OR_OVERWRITE) + except xapian.DatabaseError: + logging.critical("Could not create popcon xapian index.") + raise Error + + for root, dirs, files in os.walk(self.popcon_dir): + for submission in files: + submission_path = os.path.join(root, submission) + doc = xapian.Document() + doc.set_data(submission) + logging.debug("Parsing popcon submission at \'%s\'" % + submission_path) + for pkg, freq in self.parse_submission(submission_path): + doc.add_term(pkg,freq) + for tag in debtags_db.tags_of_package(pkg): + doc.add_term("XT"+tag,freq) + doc_id = self.add_document(doc) + logging.debug("Popcon Xapian: Indexing doc %d" % doc_id) + # python garbage collector + gc.collect() + # flush to disk database changes + self.flush() diff --git a/src/recommender.py b/src/recommender.py index 3fbf6b7..9a30b6d 100644 --- a/src/recommender.py +++ b/src/recommender.py @@ -83,6 +83,14 @@ class Recommender: self.items_repository = xapian.Database(cfg.axi) self.strategy = AxiContentBasedStrategy() + def col(self,cfg): + """ + Set recommender attributes to perform collaborative recommendation + using popcon-xapian-index as source data. + """ + self.users_repository = PopconXapianIndex(cfg) + self.strategy = CollaborativeStrategy() + def set_strategy(self,strategy): """ Set the recommendation strategy. diff --git a/src/strategy.py b/src/strategy.py index fe7ef8a..5732cc8 100644 --- a/src/strategy.py +++ b/src/strategy.py @@ -48,7 +48,6 @@ class PopularityHeuristic(ReputationHeuristic): """ pass - class PkgMatchDecider(xapian.MatchDecider): """ Extend xapian.MatchDecider to not consider installed packages. 
@@ -67,6 +66,64 @@ class PkgMatchDecider(xapian.MatchDecider): """ return doc.get_data() not in self.installed_pkgs +class UserMatchDecider(xapian.MatchDecider): + """ + Extend xapian.MatchDecider to match similar profiles. + """ + + def __init__(self, profile): + """ + Set initial parameters. + """ + xapian.MatchDecider.__init__(self) + self.profile = profile + print "mdecider:",profile + + def __call__(self, doc): + """ + True if the user has at least half of the packages from profile. + """ + profile_size = len(self.profile) + pkg_match=0 + for term in doc: + if term.term in self.profile: + pkg_match = pkg_match+1 + print "id",doc.get_docid(),"match",pkg_match + return pkg_match >= profile_size/2 + +class PkgExpandDecider(xapian.ExpandDecider): + """ + Extend xapian.ExpandDecider to consider packages only. + """ + + def __init__(self): + """ + Call base class init. + """ + xapian.ExpandDecider.__init__(self) + + def __call__(self, term): + """ + True if the term is a package. + """ + return not term.startswith("XT") + +class TagExpandDecider(xapian.ExpandDecider): + """ + Extend xapian.ExpandDecider to consider tags only. + """ + + def __init__(self, profile): + """ + Call base class init. + """ + xapian.ExpandDecider.__init__(self) + + def __call__(self, term): + """ + True if the term is a tag. + """ + return term.startswith("XT") class RecommendationStrategy: """ @@ -82,7 +139,8 @@ class ItemReputationStrategy(RecommendationStrategy): """ Perform recommendation strategy. 
""" - return RecomendationResult() + logging.critical("Item reputation recommendation strategy is not yet implemented.") + raise Error class ContentBasedStrategy(RecommendationStrategy): """ @@ -133,15 +191,41 @@ class AxiContentBasedStrategy(RecommendationStrategy): item_score[m.document.get_data()] = m.rank return recommender.RecommendationResult(item_score,20) -class ColaborativeStrategy(RecommendationStrategy): +class CollaborativeStrategy(RecommendationStrategy): """ Colaborative recommendation strategy. """ - def run(self,user,users_repository,similarity_measure): + #def run(self,rec,user,similarity_measure): + def run(self,rec,user): """ Perform recommendation strategy. """ - return RecomendationResult() + profile = user.maximal_pkg_profile() + query = xapian.Query(xapian.Query.OP_OR,profile) + enquire = xapian.Enquire(rec.users_repository) + enquire.set_query(query) + + try: + #mset = enquire.get_mset(0, 182, None, UserMatchDecider(profile)) + mset = enquire.get_mset(0, 20) + except xapian.DatabaseError as error: + logging.critical(error.get_msg()) + raise Error + + rset = xapian.RSet() + for m in mset: + rset.add_document(m.document.get_docid()) + logging.debug("Counting as relevant submission %s" % + m.document.get_data()) + + eset = enquire.get_eset(20,rset,PkgExpandDecider()) + rank = 0 + item_score = {} + for term in eset: + item_score[term.term] = rank + rank = rank+1 + + return recommender.RecommendationResult(item_score,20) class KnowledgeBasedStrategy(RecommendationStrategy): """ @@ -151,7 +235,8 @@ class KnowledgeBasedStrategy(RecommendationStrategy): """ Perform recommendation strategy. """ - return RecomendationResult() + logging.critical("Knowledge-based recommendation strategy is not yet implemented.") + raise Error class DemographicStrategy(RecommendationStrategy): """ @@ -161,4 +246,5 @@ class DemographicStrategy(RecommendationStrategy): """ Perform recommendation strategy. 
""" - return RecomendationResult() + logging.critical("Demographic recommendation strategy is not yet implemented.") + raise Error diff --git a/src/user.py b/src/user.py index 489ad02..b8c4a21 100644 --- a/src/user.py +++ b/src/user.py @@ -95,6 +95,7 @@ class User: profile_size = len(self.pkg_profile) logging.info("Reduced packages profile size from %d to %d." % (old_profile_size, profile_size)) + return self.pkg_profile class LocalSystem(User): """ -- libgit2 0.21.2