diff --git a/src/app_recommender.py b/src/app_recommender.py index 00dece2..1e7344e 100755 --- a/src/app_recommender.py +++ b/src/app_recommender.py @@ -28,7 +28,7 @@ from datetime import timedelta from config import * from data import * from evaluation import * -from similarity import * +from dissimilarity import * from recommender import * from strategy import * from user import * diff --git a/src/clustering.py b/src/clustering.py index 2ff4bcb..38da4f1 100755 --- a/src/clustering.py +++ b/src/clustering.py @@ -26,7 +26,7 @@ from datetime import timedelta from config import * from data import * -from similarity import * +from dissimilarity import * from error import Error if __name__ == '__main__': diff --git a/src/cross_validation.py b/src/cross_validation.py index f0aaf8e..326ba05 100755 --- a/src/cross_validation.py +++ b/src/cross_validation.py @@ -29,7 +29,7 @@ from datetime import timedelta from config import * from data import * from evaluation import * -from similarity import * +from dissimilarity import * from recommender import * from strategy import * from user import * diff --git a/src/data.py b/src/data.py index eb23dfc..732b1f0 100644 --- a/src/data.py +++ b/src/data.py @@ -28,36 +28,38 @@ import axi from debian import debtags import logging import hashlib +import random from error import Error from singleton import Singleton import cluster -from similarity import * - -class Item: - """ - Generic item definition. - """ - -class Package(Item): - """ - Definition of a GNU/Linux application as a recommender item. - """ - def __init__(self,package_name): - """ - Set initial attributes. - """ - self.package_name = package_name - -def normalize_tags(string): - """ - Substitute string characters : by _ and - by '. 
- Examples: - admin::package-management -> admin__package'management - implemented-in::c++ -> implemented-in__c++ - """ - return string.replace(':','_').replace('-','\'') - +from dissimilarity import * + +#class Item: +# """ +# Generic item definition. +# """ +# +#class Package(Item): +# """ +# Definition of a GNU/Linux application as a recommender item. +# """ +# def __init__(self,package_name): +# """ +# Set initial attributes. +# """ +# self.package_name = package_name +# +#def normalize_tags(string): +# """ +# Substitute string characters : by _ and - by '. +# Examples: +# admin::package-management -> admin__package'management +# implemented-in::c++ -> implemented-in__c++ +# """ +# return string.replace(':','_').replace('-','\'') + +#[FIXME] get pkg tags from axi and remove load_debtags_db method def load_debtags_db(db_path): """ Load debtags database from the source file. @@ -73,105 +75,105 @@ def load_debtags_db(db_path): logging.error("Could not load DebtagsDB from '%s'." % self.db_path) raise Error -class TagsXapianIndex(xapian.WritableDatabase,Singleton): - """ - Data source for tags info defined as a singleton xapian database. - """ - def __init__(self,cfg): - """ - Set initial attributes. - """ - self.path = os.path.expanduser(cfg.tags_index) - self.db_path = os.path.expanduser(cfg.tags_db) - self.debtags_db = debtags.DB() - - try: - db_file = open(self.db_path) - except IOError: - logging.error("Could not load DebtagsDB from '%s'." % self.db_path) - raise Error - md5 = hashlib.md5() - md5.update(db_file.read()) - self.db_md5 = md5.hexdigest() - db_file.close() - self.load_index(cfg.reindex) - -# def load_db(self): +#class TagsXapianIndex(xapian.WritableDatabase,Singleton): +# """ +# Data source for tags info defined as a singleton xapian database. +# """ +# def __init__(self,cfg): # """ -# Load debtags database from the source file. +# Set initial attributes. 
# """ -# tag_filter = re.compile(r"^special::.+$|^.+::TODO$") +# self.path = os.path.expanduser(cfg.tags_index) +# self.db_path = os.path.expanduser(cfg.tags_db) +# self.debtags_db = debtags.DB() # try: -# db_file = open(self.db_path, "r") -# self.debtags_db.read(db_file,lambda x: not tag_filter.match(x)) -# db_file.close() -# except: +# db_file = open(self.db_path) +# except IOError: # logging.error("Could not load DebtagsDB from '%s'." % self.db_path) # raise Error - - def relevant_tags_from_db(self,pkgs_list,qtd_of_tags): - """ - Return most relevant tags considering a list of packages. - """ - if not self.debtags_db.package_count(): - self.debtags_db = load_debtags_db(self.db_path) - relevant_db = self.debtags_db.choose_packages(pkgs_list) - relevance_index = debtags.relevance_index_function(self.debtags_db, - relevant_db) - sorted_relevant_tags = sorted(relevant_db.iter_tags(), - lambda a, b: cmp(relevance_index(a), - relevance_index(b))) - return normalize_tags(' '.join(sorted_relevant_tags[-qtd_of_tags:])) - - def load_index(self,reindex): - """ - Load an existing debtags index. - """ - if not reindex: - try: - logging.info("Opening existing debtags xapian index at \'%s\'" - % self.path) - xapian.Database.__init__(self,self.path) - md5 = self.get_metadata("md5") - if not md5 == self.db_md5: - logging.info("Index must be updated.") - reindex = 1 - except xapian.DatabaseError: - logging.info("Could not open debtags index.") - reindex =1 - - if reindex: - self.new_index() - - def new_index(self): - """ - Create a xapian index for debtags info based on 'debtags_db' and - place it at 'self.path'. 
- """ - if not os.path.exists(self.path): - os.makedirs(self.path) - - try: - logging.info("Indexing debtags info from \'%s\'" % - self.db_path) - logging.info("Creating new xapian index at \'%s\'" % - self.path) - xapian.WritableDatabase.__init__(self,self.path, - xapian.DB_CREATE_OR_OVERWRITE) - except xapian.DatabaseError: - logging.critical("Could not create xapian index.") - raise Error - - self.debtags_db = load_debtags_db(self.db_path) - self.set_metadata("md5",self.db_md5) - - for pkg,tags in self.debtags_db.iter_packages_tags(): - doc = xapian.Document() - doc.set_data(pkg) - for tag in tags: - doc.add_term(normalize_tags(tag)) - doc_id = self.add_document(doc) - logging.debug("Debtags Xapian: Indexing doc %d",doc_id) +# md5 = hashlib.md5() +# md5.update(db_file.read()) +# self.db_md5 = md5.hexdigest() +# db_file.close() +# self.load_index(cfg.reindex) +# +## def load_db(self): +## """ +## Load debtags database from the source file. +## """ +## tag_filter = re.compile(r"^special::.+$|^.+::TODO$") +## try: +## db_file = open(self.db_path, "r") +## self.debtags_db.read(db_file,lambda x: not tag_filter.match(x)) +## db_file.close() +## except: +## logging.error("Could not load DebtagsDB from '%s'." % self.db_path) +## raise Error +# +# def relevant_tags_from_db(self,pkgs_list,qtd_of_tags): +# """ +# Return most relevant tags considering a list of packages. +# """ +# if not self.debtags_db.package_count(): +# #print "index vazio" +# self.debtags_db = load_debtags_db(self.db_path) +# relevant_db = self.debtags_db.choose_packages(pkgs_list) +# relevance_index = debtags.relevance_index_function(self.debtags_db, +# relevant_db) +# sorted_relevant_tags = sorted(relevant_db.iter_tags(), +# lambda a, b: cmp(relevance_index(a), +# relevance_index(b))) +# return normalize_tags(' '.join(sorted_relevant_tags[-qtd_of_tags:])) +# +# def load_index(self,reindex): +# """ +# Load an existing debtags index. 
+# """ +# if not reindex: +# try: +# logging.info("Opening existing debtags xapian index at \'%s\'" +# % self.path) +# xapian.Database.__init__(self,self.path) +# md5 = self.get_metadata("md5") +# if not md5 == self.db_md5: +# logging.info("Index must be updated.") +# reindex = 1 +# except xapian.DatabaseError: +# logging.info("Could not open debtags index.") +# reindex =1 +# +# if reindex: +# self.new_index() +# +# def new_index(self): +# """ +# Create a xapian index for debtags info based on 'debtags_db' and +# place it at 'self.path'. +# """ +# if not os.path.exists(self.path): +# os.makedirs(self.path) +# +# try: +# logging.info("Indexing debtags info from \'%s\'" % +# self.db_path) +# logging.info("Creating new xapian index at \'%s\'" % +# self.path) +# xapian.WritableDatabase.__init__(self,self.path, +# xapian.DB_CREATE_OR_OVERWRITE) +# except xapian.DatabaseError: +# logging.critical("Could not create xapian index.") +# raise Error +# +# self.debtags_db = load_debtags_db(self.db_path) +# self.set_metadata("md5",self.db_md5) +# +# for pkg,tags in self.debtags_db.iter_packages_tags(): +# doc = xapian.Document() +# doc.set_data(pkg) +# for tag in tags: +# doc.add_term(normalize_tags(tag)) +# doc_id = self.add_document(doc) +# logging.debug("Debtags Xapian: Indexing doc %d",doc_id) class PopconXapianIndex(xapian.WritableDatabase,Singleton): """ @@ -232,7 +234,7 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton): """ if not os.path.exists(self.path): os.makedirs(self.path) - debtags_db = load_debtags_db(self.debtags_path) + debtags_db = load_debtags_db(self.debtags_path) #[FIXME] try: logging.info("Indexing popcon submissions from \'%s\'" % @@ -254,6 +256,7 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton): submission_path) for pkg, freq in self.parse_submission(submission_path): doc.add_term(pkg,freq) + #[FIXME] get tags from axi for tag in debtags_db.tags_of_package(pkg): doc.add_term("XT"+tag,freq) doc_id = self.add_document(doc) @@ 
-334,22 +337,27 @@ class PopconClusteredData(Singleton): s.add_pkg(pkg) self.submissions.append(s) - distanceFunction = JaccardIndex() - cl = cluster.HierarchicalClustering(self.submissions,lambda x,y: distanceFunction(x.pkgs_list,y.pkgs_list)) - clusters = cl.getlevel(0.5) - for c in clusters: - print "cluster" - for submission in c: - print submission.hash - #cl = KMeansClusteringPopcon(self.submissions, - # lambda x,y: distanceFunction(x.pkgs_list,y.pkgs_list)) + distanceFunction = JaccardDistance() + # cl = cluster.HierarchicalClustering(self.submissions,lambda x,y: distanceFunction(x.pkgs_list,y.pkgs_list)) + # clusters = cl.getlevel(0.5) + # for c in clusters: + # print "cluster" + # for submission in c: + # print submission.hash + cl = KMedoidsClusteringPopcon(self.submissions, lambda x,y: \ + distanceFunction(x.pkgs_list,y.pkgs_list)) #clusters = cl.getclusters(2) - #medoids = cl.getMedoids(2) + medoids = cl.getMedoids(2) + print "medoids" + for m in medoids: + print m.hash class KMedoidsClusteringPopcon(cluster.KMeansClustering): def __init__(self,data,distance): - cluster.KMeansClustering.__init__(self, data, distance) + if len(data)>100: + data_sample = random.sample(data,100) + cluster.KMeansClustering.__init__(self, data_sample, distance) self.distanceMatrix = {} for submission in self._KMeansClustering__data: self.distanceMatrix[submission.hash] = {} @@ -377,7 +385,7 @@ class KMedoidsClusteringPopcon(cluster.KMeansClustering): for i in range(len(cluster)): totalDistance = sum(self.distanceMatrix[cluster[i].hash].values()) print "totalDistance[",i,"]=",totalDistance - if totalDistance < centroidDistance: + if totalDistance < medoidDistance: medoidDistance = totalDistance medoid = i print "medoidDistance:",medoidDistance diff --git a/src/dissimilarity.py b/src/dissimilarity.py index 3e48400..aacec6c 100644 --- a/src/dissimilarity.py +++ b/src/dissimilarity.py @@ -1,7 +1,7 @@ #!/usr/bin/env python """ - similarity - python module for classes and methods 
related to similarity - measuring between two sets of data. + dissimilarity - python module for classes and methods related to similarity + measuring between two sets of data. """ __author__ = "Tassia Camoes Araujo " __copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" diff --git a/src/strategy.py b/src/strategy.py index 307555c..a88ca25 100644 --- a/src/strategy.py +++ b/src/strategy.py @@ -144,30 +144,30 @@ class ItemReputationStrategy(RecommendationStrategy): logging.critical("Item reputation recommendation strategy is not yet implemented.") raise Error -class ContentBasedStrategy(RecommendationStrategy): - """ - Content-based recommendation strategy. - """ - def run(self,rec,user): - """ - Perform recommendation strategy. - """ - profile = user.txi_tag_profile(rec.items_repository,50) - qp = xapian.QueryParser() - query = qp.parse_query(profile) - enquire = xapian.Enquire(rec.items_repository) - enquire.set_query(query) - - try: - mset = enquire.get_mset(0, 20, None, PkgMatchDecider(user.items())) - except xapian.DatabaseError as error: - logging.critical(error.get_msg()) - raise Error - - item_score = {} - for m in mset: - item_score[m.document.get_data()] = m.rank - return recommender.RecommendationResult(item_score,20) +#class ContentBasedStrategy(RecommendationStrategy): +# """ +# Content-based recommendation strategy. +# """ +# def run(self,rec,user): +# """ +# Perform recommendation strategy. 
class DemographicProfile(Singleton):
    """
    Map demographic profile names to the sets of debtags tags/facets that
    characterize them.

    Each profile is stored as an instance attribute named after the profile
    ('admin', 'devel', 'desktop', 'art', 'science'). Calling the instance
    with a collection of profile names returns the union of their tag sets.
    """
    def __init__(self):
        """
        Define the tag set associated with each known profile.
        """
        self.admin = set(["admin", "hardware", "mail", "protocol",
                          "network", "security", "web", "interface::web"])
        self.devel = set(["devel", "role::devel-lib", "role::shared-lib"])
        self.desktop = set(["x11", "accessibility", "game", "junior", "office",
                            "interface::x11"])
        self.art = set(["field::arts", "sound"])
        # "field::electronics" was previously misspelled "field::eletronics",
        # which cannot match the debtags vocabulary entry.
        self.science = set(["science", "biology", "field::astronomy",
                            "field::aviation", "field::biology",
                            "field::chemistry", "field::electronics",
                            "field::finance", "field::geography",
                            "field::geology", "field::linguistics",
                            "field::mathematics", "field::medicine",
                            "field::meteorology", "field::physics",
                            "field::statistics"])

    def __call__(self,profiles_set):
        """
        Return the union of the tag sets of every profile named in
        'profiles_set'.

        Each element must be the name of a profile attribute defined in
        __init__; an unknown name raises AttributeError.
        """
        demographic_profile = set()
        for profile in profiles_set:
            # Look the attribute up by name instead of eval()'ing a string
            # built from the profile name, so the name is never executed as
            # Python code.
            demographic_profile |= getattr(self, profile)
        return demographic_profile
""" - def __init__(self,item_score,user_id=0,demographic_profile=0): + def __init__(self,item_score,user_id=0,profiles_set=0): """ - Set initial parameters. + Set initial user attributes. If no user_id was passed as parameter, a + random md5-hash is generated for that purpose. If the demographic + profile was not defined, it defaults to 'desktop' """ - self.id = user_id self.item_score = item_score + if user_id: + self.id = user_id + else: + random.seed() + self.id = random.getrandbits(128) self.pkg_profile = self.item_score.keys() - self.demographic_profile = demographic_profile + if not profiles_set: + profiles_set = set(["desktop"]) + self.set_demographic_profile(profiles_set) + + def set_demographic_profile(self,profiles_set): + self.demographic_profile = DemographicProfile()(profiles_set) def items(self): """ - Return dictionary relating items and repective scores. + Return the set of user items. """ - return self.item_score.keys() + return set(self.item_score.keys()) def axi_tag_profile(self,apt_xapian_index,profile_size): """ Return most relevant tags for a list of packages based on axi. """ - terms = [] - for item in self.pkg_profile: - terms.append("XP"+item) + terms = ["XP"+item for item in self.pkg_profile] query = xapian.Query(xapian.Query.OP_OR, terms) enquire = xapian.Enquire(apt_xapian_index) enquire.set_query(query) rset = xapian.RSet() - for m in enquire.get_mset(0,30000): #consider all matches + for m in enquire.get_mset(0,apt_xapian_index.get_doccount()): rset.add_document(m.docid) + # statistically good differentiators between relevant and non-relevant eset = enquire.get_eset(profile_size, rset, FilterTag()) profile = [] for res in eset: profile.append(res.term) - logging.debug("%.2f %s" % (res.weight,res.term[2:])) + logging.debug("%.2f %s" % (res.weight,res.term.lstrip("XT"))) return profile - def txi_tag_profile(self,tags_xapian_index,profile_size): - """ - Return most relevant tags for a list of packages based on tags index. 
- """ - return tags_xapian_index.relevant_tags_from_db(self.pkg_profile, - profile_size) + #def txi_tag_profile(self,tags_xapian_index,profile_size): + # """ + # Return most relevant tags for a list of packages based on tags index. + # """ + # return tags_xapian_index.relevant_tags_from_db(self.pkg_profile, + # profile_size) def maximal_pkg_profile(self): """ -- libgit2 0.21.2