From 9332e65ed65a9b72a2db23629c03129ba499f8ca Mon Sep 17 00:00:00 2001 From: Tássia Camões Araújo Date: Tue, 9 Aug 2011 07:10:37 +0000 Subject: [PATCH] Fixed logging bugs, improved filtering by file, new PkgListSystem class, survey bug fixes, and some other work-in-progress updates. --- src/bin/get_highinst.py | 10 ---------- src/bin/get_pkgs_inst.py | 15 +++++++++++++++ src/recommender.py | 10 +++++++++- src/strategy.py | 160 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------------------------------------------------- src/user.py | 59 +++++++++++++++++++++++++++++++++++++++++++++++------------ src/web/survey.py | 77 +++++++++++++++++++++++++++++++++++++++++++++++++---------------------------- 6 files changed, 211 insertions(+), 120 deletions(-) delete mode 100755 src/bin/get_highinst.py create mode 100755 src/bin/get_pkgs_inst.py diff --git a/src/bin/get_highinst.py b/src/bin/get_highinst.py deleted file mode 100755 index bd34628..0000000 --- a/src/bin/get_highinst.py +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env python - -if __name__ == '__main__': - with open("/root/org/popcon.debian.org/popcon-mail/results") as results: - for line in results.readlines(): - if line.startswith("Package"): - fields = line.split() - inst = int(fields[2])+int(fields[3])+int(fields[4]) - if inst > 20: - print fields[1], inst diff --git a/src/bin/get_pkgs_inst.py b/src/bin/get_pkgs_inst.py new file mode 100755 index 0000000..9f18465 --- /dev/null +++ b/src/bin/get_pkgs_inst.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python + +from operator import itemgetter +if __name__ == '__main__': + pkgs_inst = {} + with open("/root/org/popcon.debian.org/popcon-mail/results") as results: + for line in results: + if line.startswith("Package"): + fields = line.split() + inst = int(fields[2])+int(fields[3])+int(fields[4]) + if inst > 20: + pkgs_inst[fields[1]] = inst + sorted_by_inst = sorted(pkgs_inst.items(), key=itemgetter(1)) + for pkg, inst in sorted_by_inst: + print pkg, inst diff --git a/src/recommender.py b/src/recommender.py index abc9151..4477c7a 100644 --- a/src/recommender.py +++ b/src/recommender.py @@ -19,6 +19,8 @@ __license__ = """ along with this program. If not, see . """ +import logging +import os import xapian import operator import data @@ -73,14 +75,20 @@ class Recommender: self.weight = xapian.BM25Weight() else: self.weight = xapian.TradWeight() + self.valid_pkgs = [] + # file format: one pkg_name per line + with open(os.path.join(cfg.filters,cfg.pkgs_filter)) as valid_pkgs: + self.valid_pkgs = [line.strip() for line in valid_pkgs + if not line.startswith("#")] def set_strategy(self,strategy_str): """ Set the recommendation strategy. """ + logging.info("Setting recommender strategy to \'%s\'" % strategy_str) self.items_repository = xapian.Database(self.cfg.axi) if "desktop" in strategy_str: - self.items_repository = xapian.Database("/root/.app-recommender/DesktopAxi") + self.items_repository = xapian.Database("/root/.app-recommender/axi_desktop") self.cfg.popcon_index = "/root/.app-recommender/popcon-index_desktop_1000" if strategy_str == "cb" or strategy_str == "cb_desktop": diff --git a/src/strategy.py b/src/strategy.py index 9982c2c..c3582e0 100644 --- a/src/strategy.py +++ b/src/strategy.py @@ -40,85 +40,102 @@ class PkgMatchDecider(xapian.MatchDecider): """ True if the package is not already installed. """ - return doc.get_data() not in self.pkgs_list - -class AppMatchDecider(xapian.MatchDecider): - """ - Extend xapian.MatchDecider to not consider only applications packages. - """ - def __init__(self, pkgs_list, axi): - """ - Set initial parameters. - """ - xapian.MatchDecider.__init__(self) - self.pkgs_list = pkgs_list - self.axi = axi - - def __call__(self, doc): - """ - True if the package is not already installed. - """ - tags = axi_search_pkg_tags(self.axi,doc.get_data()) - return (("XTrole::program" in tags) and - (doc.get_data() not in self.pkgs_list)) - -class UserMatchDecider(xapian.MatchDecider): - """ - Extend xapian.MatchDecider to match similar profiles. - """ - - def __init__(self, profile): - """ - Set initial parameters. - """ - xapian.MatchDecider.__init__(self) - self.profile = profile - - def __call__(self, doc): - """ - True if the user has more the half of packages from profile. - """ - match=0 - for term in doc: - if term.term in self.profile: - match = match+1 - return (match >= len(self.profile)/2) + pkg = doc.get_data() + is_new = pkg not in self.pkgs_list + if "kde" in pkg: + return is_new and "kde" in self.pkgs_list + if "gnome" in pkg: + return is_new and "gnome" in self.pkgs_list + return is_new class PkgExpandDecider(xapian.ExpandDecider): """ Extend xapian.ExpandDecider to consider packages only. """ - def __call__(self, term): + def __init__(self, pkgs_list): """ - True if the term is a package. + Set initial parameters. """ - # [FIXME] return term.startswith("XP") - #return not term.startswith("XT") - return term.startswith("XP") - -class AppExpandDecider(xapian.ExpandDecider): - """ - Extend xapian.ExpandDecider to consider applications only. - """ - def __init__(self,axi): xapian.ExpandDecider.__init__(self) - self.axi = axi + self.pkgs_list = pkgs_list def __call__(self, term): """ True if the term is a package. """ - if not term.startswith("XT"): - package = term.lstrip("XP") - print package - tags = axi_search_pkg_tags(self.axi,package) - if "XTrole::program" in tags: - print tags - return True - else: - return False - else: - return False + pkg = term.lstrip("XP") + is_new_pkg = pkg not in self.pkgs_list and term.startswith("XP") + if "kde" in pkg: + return is_new_pkg and "kde" in self.pkgs_list + if "gnome" in pkg: + return is_new_pkg and "gnome" in self.pkgs_list + return is_new_pkg + +#class AppMatchDecider(xapian.MatchDecider): +# """ +# Extend xapian.MatchDecider to not consider only applications packages. +# """ +# def __init__(self, pkgs_list, axi): +# """ +# Set initial parameters. +# """ +# xapian.MatchDecider.__init__(self) +# self.pkgs_list = pkgs_list +# self.axi = axi +# +# def __call__(self, doc): +# """ +# True if the package is not already installed. +# """ +# tags = axi_search_pkg_tags(self.axi,doc.get_data()) +# return (("XTrole::program" in tags) and +# (doc.get_data() not in self.pkgs_list)) +# +#class UserMatchDecider(xapian.MatchDecider): +# """ +# Extend xapian.MatchDecider to match similar profiles. +# """ +# +# def __init__(self, profile): +# """ +# Set initial parameters. +# """ +# xapian.MatchDecider.__init__(self) +# self.profile = profile +# +# def __call__(self, doc): +# """ +# True if the user has more the half of packages from profile. +# """ +# match=0 +# for term in doc: +# if term.term in self.profile: +# match = match+1 +# return (match >= len(self.profile)/2) + +#class AppExpandDecider(xapian.ExpandDecider): +# """ +# Extend xapian.ExpandDecider to consider applications only. +# """ +# def __init__(self,axi): +# xapian.ExpandDecider.__init__(self) +# self.axi = axi +# +# def __call__(self, term): +# """ +# True if the term is a package. +# """ +# if not term.startswith("XT"): +# package = term.lstrip("XP") +# print package +# tags = axi_search_pkg_tags(self.axi,package) +# if "XTrole::program" in tags: +# print tags +# return True +# else: +# return False +# else: +# return False class TagExpandDecider(xapian.ExpandDecider): """ @@ -149,8 +166,10 @@ class ContentBasedStrategy(RecommendationStrategy): """ Perform recommendation strategy. """ + logging.debug("Composing user profile...") profile = user.content_profile(rec.items_repository,self.content, self.profile_size) + logging.debug(profile) # prepair index for querying user profile query = xapian.Query(xapian.Query.OP_OR,profile) enquire = xapian.Enquire(rec.items_repository) @@ -188,8 +207,10 @@ class CollaborativeStrategy(RecommendationStrategy): """ Perform recommendation strategy. """ + logging.debug("Composing user profile...") profile = ["XP"+package for package in - user.filter_pkg_profile("/root/.app-recommender/filters/program")] + user.filter_pkg_profile(rec.valid_pkgs)] + logging.debug(profile) # prepair index for querying user profile query = xapian.Query(xapian.Query.OP_OR,profile) enquire = xapian.Enquire(rec.users_repository) @@ -208,13 +229,14 @@ class CollaborativeStrategy(RecommendationStrategy): # retrieve most relevant packages #eset = enquire.get_eset(recommendation_size,rset, # AppExpandDecider(rec.items_repository)) - eset = enquire.get_eset(recommendation_size,rset,PkgExpandDecider()) + eset = enquire.get_eset(recommendation_size,rset, + PkgExpandDecider(user.items())) # compose result dictionary item_score = {} ranking = [] for e in eset: package = e.term.lstrip("XP") - tags = axi_search_pkg_tags(rec.items_repository,package) + #tags = axi_search_pkg_tags(rec.items_repository,package) #[FIXME] set this constraint somehow #if "XTrole::program" in tags: item_score[package] = e.weight diff --git a/src/user.py b/src/user.py index 11a7ec8..ac007e4 100644 --- a/src/user.py +++ b/src/user.py @@ -26,6 +26,7 @@ import datetime import xapian import logging import apt +from error import Error from singleton import Singleton import data @@ -113,9 +114,14 @@ class User: Get user profile for a specific type of content: packages tags, description or both (full_profile) """ - if content == "tag": return self.tag_profile(items_repository,size) - if content == "desc": return self.desc_profile(items_repository,size) - if content == "full": return self.full_profile(items_repository,size) + if content == "tag": + profile = self.tag_profile(items_repository,size) + if content == "desc": + profile = self.desc_profile(items_repository,size) + if content == "full": + profile = self.full_profile(items_repository,size) + logging.debug("User profile: %s" % profile) + return profile def tag_profile(self,items_repository,size): """ @@ -155,17 +161,28 @@ class User: desc_profile = self.desc_profile(items_repository,size)[:size/2] return tag_profile+desc_profile - def filter_pkg_profile(self,filter_file): + def filter_pkg_profile(self,filter_list_or_file): """ Return list of packages from profile listed in the filter_file. """ + if type(filter_list_or_file).__name__ == "list": + valid_pkgs = filter_list_or_file + elif type(filter_list_or_file).__name__ == "str": + try: + with open(filter_list_or_file) as valid: + valid_pkgs = [line.strip() for line in valid] + except IOError: + logging.critical("Could not open profile filter file.") + raise Error + else: + logging.debug("No filter provided for user profiling.") + return self.pkg_profile + old_profile_size = len(self.pkg_profile) - with open(filter_file) as valid: - valid_pkgs = [line.strip() for line in valid] - for pkg in self.pkg_profile[:]: #iterate list copy - if pkg not in valid_pkgs: - self.pkg_profile.remove(pkg) - logging.debug("Discarded package %s during profile filtering" % pkg) + for pkg in self.pkg_profile[:]: #iterate list copy + if pkg not in valid_pkgs: + self.pkg_profile.remove(pkg) + logging.debug("Discarded package %s during profile filtering" % pkg) profile_size = len(self.pkg_profile) logging.debug("Filtered package profile: reduced packages profile size \ from %d to %d." % (old_profile_size, profile_size)) @@ -199,7 +216,6 @@ class RandomPopcon(User): """ Set initial parameters. """ - item_score = {} len_profile = 0 while len_profile < 100: path = random.choice([os.path.join(root, submission) for @@ -217,10 +233,29 @@ class PopconSystem(User): """ Set initial parameters. """ - item_score = {} submission = data.PopconSubmission(path) User.__init__(self,submission.packages,submission.user_id) +class PkgsListSystem(User): + def __init__(self,pkgs_list_or_file): + """ + Set initial parameters. + """ + if type(pkgs_list_or_file).__name__ == "list": + pkgs_list = filter_list_or_file + elif type(pkgs_list_or_file).__name__ == "str": + try: + with open(pkgs_list_or_file) as pkgs_list_file: + pkgs_list = [line.split()[0] for line in pkgs_list_file] + except IOError: + logging.critical("Could not open packages list file.") + raise Error + else: + logging.debug("No packages provided for user profiling.") + return self.pkg_profile + + User.__init__(self,dict.fromkeys(pkgs_list,1)) + class LocalSystem(User): """ Extend the class User to consider the packages installed on the local diff --git a/src/web/survey.py b/src/web/survey.py index 1a60e24..052ee52 100755 --- a/src/web/survey.py +++ b/src/web/survey.py @@ -11,7 +11,8 @@ import re sys.path.insert(0,"../") -from config import * +import logging +from config import Config from recommender import * from user import * @@ -30,7 +31,7 @@ class Thanks: web_input = web.input() user_id = web_input['user_id'].encode('utf8') with open("./submissions/%s/ident" % user_id,'w') as ident: - for key in ["name","email","country","public","comments"]: + for key in ["name","email","comments"]: if web_input.has_key(key): ident.write("%s: %s\n" % (key,web_input[key].encode("utf-8"))) return render.thanks_id() @@ -79,22 +80,30 @@ class Package: class Request: def __init__(self,web_input,submissions_dir,user_id=0,pkgs_list=0): self.strategy = "" - print "Request from user",user_id if user_id: self.user_id = user_id self.outputdir = os.path.join(submissions_dir,user_id) + logging.info("New round for user %s" % self.user_id) else: self.outputdir = tempfile.mkdtemp(prefix='',dir=submissions_dir) - print ("created dir %s" % self.outputdir) self.user_id = self.outputdir.lstrip(submissions_dir) + logging.info("Request from user %s" % self.user_id) + logging.debug("Created dir %s" % self.outputdir) + pkgs_list_file = os.path.join(self.outputdir,"packages_list") if pkgs_list: self.pkgs_list = pkgs_list + if not os.path.exists(pkgs_list_file): + with open(pkgs_list_file,"w") as f: + for pkg in pkgs_list: + f.write(pkg+"\n") else: self.pkgs_list = [] if web_input['pkgs_file'].value: - f = open(self.outputdir + "/packages_list", "wb") + f = open(pkgs_list_file, "w") lines = web_input['pkgs_file'].file.readlines() + with open(os.path.join(self.outputdir,"upload"), "w") as upload: + upload.writelines(lines) # popcon submission format if lines[0].startswith('POPULARITY-CONTEST'): del lines[0] @@ -122,15 +131,15 @@ class Request: class Save: def POST(self): web_input = web.input() - print web_input + logging.info("Saving user evaluation...") + logging.info(web_input) user_id = web_input['user_id'].encode('utf8') with open("./submissions/%s/packages_list" % user_id) as packages_list: pkgs_list = [line.strip() for line in packages_list.readlines()] strategy = web_input['strategy'] - print user_id,strategy,pkgs_list - output_dir = "./submissions/%s/%s/" % (user_id,strategy) - if not os.path.exists(output_dir): - os.makedirs(output_dir) + logging.debug("Saving evaluation for user %s, strategy %s and packages..." + % (user_id,strategy)) + logging.debug(pkgs_list) evaluations = {} evaluations["poor"] = [] evaluations["good"] = [] @@ -138,16 +147,17 @@ class Save: for key, value in web_input.items(): if key.startswith("evaluation-"): evaluations[value.encode('utf8')].append(key.lstrip("evaluation-")) + output_dir = ("./submissions/%s/%s/" % (user_id,strategy)) for key,value in evaluations.items(): - with open(output_dir+key,'w') as output: + with open(os.path.join(output_dir,key),'w') as output: for item in value: output.write(item+"\n") - with open(output_dir+"report",'w') as report: + with open(os.path.join(output_dir,"report"),'w') as report: report.write("# User: %s\n# Strategy: %s\n# TP FP\n%d %d\n" % (user_id,strategy, len(evaluations["good"])+len(evaluations["surprising"]), len(evaluations["poor"]))) - if web_input.has_key('strategy_button'): + if web_input.has_key('continue_button'): return Survey().POST() elif web_input.has_key('finish_button'): return render.thanks(user_id) @@ -156,23 +166,21 @@ class Save: class Survey: def __init__(self): - self.strategies = ["cb","cbd","cbt","col","cb-desktop","cbd-desktop", - "cbt-desktop","col-desktop"] + logging.info("Setting up survey...") self.rec = Recommender(Config()) - #print rec.users_repository.get_doccount() self.submissions_dir = "./submissions/" if not os.path.exists(self.submissions_dir): os.makedirs(self.submissions_dir) def POST(self): web_input = web.input(pkgs_file={}) - print "WEB_INPUT",web_input + logging.debug("Survey web_input %s" % str(web_input)) + self.strategies = ["cb","cbd","cbt","col"] # If it is not the first strategy round, save the previous evaluation if not web_input.has_key('user_id'): request = Request(web_input,self.submissions_dir) else: user_id = web_input['user_id'].encode('utf8') - print "Continue", user_id with open("./submissions/%s/packages_list" % user_id) as packages_list: pkgs_list = [line.strip() for line in packages_list.readlines()] request = Request(web_input,self.submissions_dir,user_id,pkgs_list) @@ -180,30 +188,43 @@ class Survey: return render.error_survey() else: user = User(dict.fromkeys(request.pkgs_list,1),request.user_id) - user.maximal_pkg_profile() - results = dict() + program_profile = user.filter_pkg_profile(os.path.join(self.rec.cfg.filters,"program")) + desktop_profile = user.filter_pkg_profile(os.path.join(self.rec.cfg.filters,"desktop")) + if (len(desktop_profile)>10 or + len(desktop_profile)>len(program_profile)/2): + self.strategies = [strategy_str+"_desktop" for strategy_str + in self.strategies[:]] old_strategies = [dirs for root, dirs, files in os.walk(os.path.join(self.submissions_dir, request.user_id))] if old_strategies: strategies = [s for s in self.strategies if s not in old_strategies[0]] - print "OLD Strategies", old_strategies[0] + logging.info("Already used strategies %s" % old_strategies[0]) else: strategies = self.strategies - print "LEFT",strategies if not strategies: return render.thanks(user_id) request.strategy = random.choice(strategies) - print "selected",request.strategy + logging.info("Selected \'%s\' from %s" % (request.strategy,strategies)) self.rec.set_strategy(request.strategy) prediction = self.rec.get_recommendation(user,10).get_prediction() - print prediction + logging.info("Prediction for user %s" % user.user_id) + logging.info(str(prediction)) + output_dir = ("./submissions/%s/%s/" % + (user.user_id,request.strategy)) + os.makedirs(output_dir) + with open(os.path.join(output_dir,"prediction"),"w") as prediction_file: + for pkg,rating in prediction: + prediction_file.write("%s %f.2\n" % (pkg,rating)) + logging.debug("Saved %s/%s prediction to file" % + (user.user_id,request.strategy)) recommendation = [result[0] for result in prediction] pkg_summaries = {} pkg_details = [] cache = apt.Cache() for pkg in recommendation: try: + logging.debug("Getting details of package %s" % pkg) pkg_details.append(Package().get_details_from_dde(pkg)) pkg_summaries[pkg] = cache[pkg].candidate.summary except: @@ -236,7 +257,7 @@ urls = ('/', 'Index', web.webapi.internalerror = web.debugerror if __name__ == "__main__": - apprec = web.application(urls, globals()) - apprec.add_processor(add_global_hook()) - apprec.run() - + cfg = Config() + apprec = web.application(urls, globals()) + apprec.add_processor(add_global_hook()) + apprec.run() -- libgit2 0.21.2