diff --git a/src/evaluation.py b/src/evaluation.py
index 3697618..99f8644 100644
--- a/src/evaluation.py
+++ b/src/evaluation.py
@@ -294,6 +294,10 @@ class CrossValidation:
             round_user = User(cross_item_score)
             result_size = int(self.recommender.items_repository.get_doccount()*
                               self.result_proportion)
+            logging.debug("size %d" % result_size)
+            if not result_size:
+                logging.critical("Recommendation size is zero.")
+                raise Error
             predicted_result = self.recommender.get_recommendation(round_user,result_size)
             if not predicted_result.size:
                 logging.critical("No recommendation produced. Abort cross-validation.")
diff --git a/src/experiments/strategies-suite.py b/src/experiments/strategies-suite.py
index d1a7a24..b67a9ce 100755
--- a/src/experiments/strategies-suite.py
+++ b/src/experiments/strategies-suite.py
@@ -30,121 +30,158 @@ import logging
 import random
 import Gnuplot
 
-def run_iteration(label,cfg,sample_proportion,n):
+def write_recall_log(label,sample,recommendation,log_file):
+    """Write the rank of each sampled package in the recommendation.
+
+    The file starts with two '#' header lines taken from label, then the
+    sorted ranks of the sample packages found in recommendation.ranking,
+    then the packages missing from the recommendation, if any.
+    """
+    notfound = []
+    ranks = []
+    for pkg in sample.keys():
+        # A single index() scan replaces the former membership test
+        # followed by a second scan for the position.
+        try:
+            ranks.append(recommendation.ranking.index(pkg))
+        except ValueError:
+            notfound.append(pkg)
+    # The with-block closes the log even if a write fails.
+    with open(log_file,'w') as output:
+        output.write("# %s\n" % label["description"])
+        output.write("# %s\n" % label["values"])
+        for r in sorted(ranks):
+            output.write(str(r)+"\n")
+        if notfound:
+            output.write("Out of recommendation:\n")
+            for pkg in notfound:
+                output.write(pkg+"\n")
+
+def plot_summary(sample,recommendation,repo_size,log_file):
+    """Plot accuracy, precision, recall and F1 against recommendation size.
+
+    The metrics are evaluated on growing prefixes of
+    recommendation.ranking (step 100) against the sample, and the chart
+    is saved as PostScript to log_file+"-plot.ps".
+    """
+    accuracy = []
+    precision = []
+    recall = []
+    f1 = []
+    g = Gnuplot.Gnuplot()
+    g('set style data lines')
+    g.xlabel('Recommendation size')
+    for size in range(1,len(recommendation.ranking)+1,100):
+        predicted = RecommendationResult(dict.fromkeys(recommendation.ranking[:size],1))
+        real = RecommendationResult(sample)
+        evaluation = Evaluation(predicted,real,repo_size)
+        accuracy.append([size,evaluation.run(Accuracy())])
+        precision.append([size,evaluation.run(Precision())])
+        recall.append([size,evaluation.run(Recall())])
+        f1.append([size,evaluation.run(F1())])
+
+    g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
+           Gnuplot.Data(precision,title="Precision"),
+           Gnuplot.Data(recall,title="Recall"),
+           Gnuplot.Data(f1,title="F1"))
+    # Export once; the second identical hardcopy call was redundant.
+    g.hardcopy(log_file+"-plot.ps", terminal="postscript")
+
+def run_iteration(user,cfg,label,sample):
+    """Recommend for user under cfg, then log recall and plot metrics.
+
+    Output files are named after label["values"] under results/strategies/.
+    """
     rec = Recommender(cfg)
     repo_size = rec.items_repository.get_doccount()
-    user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters,"desktop"))
-    print "profile",user.pkg_profile
-    user.maximal_pkg_profile()
-    sample_size = int(len(user.pkg_profile)*sample_proportion)
-    for n in range(iteration):
-        item_score = dict.fromkeys(user.pkg_profile,1)
-        # Prepare partition
-        sample = {}
-        for i in range(sample_size):
-            key = random.choice(item_score.keys())
-            sample[key] = item_score.pop(key)
-        # Get full recommendation
-        user = User(item_score)
-        recommendation = rec.get_recommendation(user,repo_size)
-        # Write recall log
-        log_file = "results/strategies/"+label["values"]
-        output = open(log_file,'w')
-        output.write("# %s\n" % label["description"])
-        output.write("# %s\n" % label["values"])
-        notfound = []
-        ranks = []
-        for pkg in sample.keys():
-            if pkg in recommendation.ranking:
-                ranks.append(recommendation.ranking.index(pkg))
-            else:
-                notfound.append(pkg)
-        for r in sorted(ranks):
-            output.write(str(r)+"\n")
-        if notfound:
-            output.write("Out of recommendation:\n")
-            for pkg in notfound:
-                output.write(pkg+"\n")
-        output.close()
-        # Plot metrics summary
-        accuracy = []
-        precision = []
-        recall = []
-        f1 = []
-        g = Gnuplot.Gnuplot()
-        g('set style data lines')
-        g.xlabel('Recommendation size')
-        for size in range(1,len(recommendation.ranking)+1,100):
-            predicted = RecommendationResult(dict.fromkeys(recommendation.ranking[:size],1))
-            real = RecommendationResult(sample)
-            evaluation = Evaluation(predicted,real,repo_size)
-            accuracy.append([size,evaluation.run(Accuracy())])
-            precision.append([size,evaluation.run(Precision())])
-            recall.append([size,evaluation.run(Recall())])
-            f1.append([size,evaluation.run(F1())])
-
-        g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
-               Gnuplot.Data(precision,title="Precision"),
-               Gnuplot.Data(recall,title="Recall"),
-               Gnuplot.Data(f1,title="F1"))
-        g.hardcopy(log_file+"-plot.ps", enhanced=1, color=1)
+    recommendation = rec.get_recommendation(user,repo_size)
+    log_file = "results/strategies/"+label["values"]
+    write_recall_log(label,sample,recommendation,log_file)
+    plot_summary(sample,recommendation,repo_size,log_file)
 
+def run_strategies(user,sample,n):
+    """Run every selected strategy and parameter combination once.
+
+    Content-based and/or collaborative strategies are selected through
+    sys.argv (both run when no argument is given). The parameter grids
+    (bm25_k1, profile_size, content_based, collaborative, popcon_size,
+    neighbors) are module-level names set in the __main__ section.
+    """
+    cfg = Config()
+    label = {}
+    # Share of the whole profile held out as sample; the caller removes
+    # the sampled packages before building user. float() avoids integer division.
+    profile_total = len(user.pkg_profile)+len(sample)
+    sample_proportion = 0.0
+    if profile_total:
+        sample_proportion = float(len(sample))/profile_total
+    run_content = "content" in sys.argv or len(sys.argv)<2
+    # "colaborative" is the historical spelling; keep accepting it.
+    run_collaborative = ("colaborative" in sys.argv or
+                         "collaborative" in sys.argv or len(sys.argv)<2)
+    if run_collaborative:
+        # Keep the configured values so each size is appended to the
+        # base value instead of piling up across iterations.
+        base_desktopapps = cfg.popcon_desktopapps
+        base_programs = cfg.popcon_programs
+    for k in bm25_k1:
+        cfg.bm25_k1 = k
+        if run_content:
+            for size in profile_size:
+                cfg.profile_size = size
+                for strategy in content_based:
+                    cfg.strategy = strategy
+                    label["description"] = "k1_bm25-profile-strategy-sample-n"
+                    label["values"] = ("%.2f-%d-%s-%.2f-%d" %
+                                       (cfg.bm25_k1,cfg.profile_size,
+                                        cfg.strategy,sample_proportion,n))
+                    run_iteration(user,cfg,label,sample)
+        if run_collaborative:
+            for strategy in collaborative:
+                cfg.strategy = strategy
+                for size in popcon_size:
+                    # NOTE(review): assumes these settings are strings
+                    # suffixed with the size - confirm against Config.
+                    cfg.popcon_desktopapps = base_desktopapps+str(size)
+                    cfg.popcon_programs = base_programs+str(size)
+                    for neighborhood in neighbors:
+                        cfg.k_neighbors = neighborhood
+                        k_str = "k"+str(cfg.k_neighbors)
+                        label["description"] = "k1_bm25-popcon-strategy-k-sample-n"
+                        # Label with the current size, not the whole list,
+                        # so every popcon size gets its own log file.
+                        label["values"] = ("%.2f-%s-%s-%s-%.2f-%d" %
+                                           (cfg.bm25_k1,str(size),cfg.strategy,
+                                            k_str,sample_proportion,n))
+                        run_iteration(user,cfg,label,sample)
 
 if __name__ == '__main__':
-    iteration = 10
+    iterations = 10
     samples_proportion = [0.5, 0.6, 0.7, 0.8, 0.9]
     weights = ['bm25', 'trad']
-    cb_strategies = ['cb','cbt','cbd']
-    #cb_strategies = []
-    profile_size = range(10,100,10)
-    items_repository = ["data/AppAxi","/var/lib/apt-xapian-index/index"]
-    users_repository = ["data/popcon_index_full","data/popcon_index-50000",
-                        "data/popcon_index_10000","data/popcon_index_1000"]
-    users_repository = []
-    neighbors = range(10,1010,100)
+    bm25_k1 = [1.0, 1.2, 1.4, 1.6, 1.8, 2.0]
+    content_based = ['cb','cbt','cbd','cbh',
+                     'cb_eset','cbt_eset','cbd_eset','cbh_eset']
+    collaborative = ['knn','knn_plus','knn_eset']
+    hybrid = ['knnco','knnco_eset']
 
-    cfg = Config()
-    cfg.index_mode = "old"
-    label = {}
+    profile_size = range(10,100,10)
+    popcon_size = [1000,10000,50000,'full']
+    neighbors = range(10,510,100)
 
-    for w in weights:
-        cfg.weight = w
-        for items_repo in items_repository:
-            cfg.axi = items_repo
-            if "App" in cfg.axi:
-                axi_str = "axiapp"
-            else:
-                axi_str = "axifull"
-            for sample_proportion in samples_proportion:
-                if "content" in sys.argv or len(sys.argv)<2:
-                    for size in profile_size:
-                        cfg.profile_size = size
-                        for strategy in cb_strategies:
-                            cfg.strategy = strategy
-                            for n in range(iteration):
-                                label["description"] = "weight-axi-profile-strategy-sample-n"
-                                label["values"] = ("%s-%s-%d-%s-%.2f-%d" %
-                                                   (cfg.weight,axi_str,cfg.profile_size,
-                                                    cfg.strategy,sample_proportion,n))
-                                run_iteration(label,cfg,sample_proportion,n)
-                if "colaborative" in sys.argv or len(sys.argv)<2:
-                    cfg.strategy = "col"
-                    for users_repo in users_repository:
-                        cfg.popcon_index = users_repo
-                        for k in neighbors:
-                            cfg.k_neighbors = k
-                            for n in range(iteration):
-                                k_str = "k"+str(cfg.k_neighbors)
-                                if "full" in cfg.popcon_index:
-                                    popcon_str = "popfull"
-                                if "50000" in cfg.popcon_index:
-                                    popcon_str = "pop50000"
-                                if "10000" in cfg.popcon_index:
-                                    popcon_str = "pop10000"
-                                if "1000" in cfg.popcon_index:
-                                    popcon_str = "pop1000"
-                                label["description"] = "weight-axi-popcon-profile-strategy-k-sample-n"
-                                label["values"] = ("%s-%s-%s-%d-%s-%s-%.2f-%d" %
-                                                   (cfg.weight,axi_str,popcon_str,cfg.profile_size,
-                                                    cfg.strategy,k_str,sample_proportion,n))
-                                run_iteration(label,cfg,sample_proportion,n)
+    user = LocalSystem()
+    #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
+    user.maximal_pkg_profile()
+    for sample_proportion in samples_proportion:
+        for n in range(iterations):
+            # Fill user profile
+            item_score = {}
+            for pkg in user.pkg_profile:
+                item_score[pkg] = user.item_score[pkg]
+            # Prepare partition sample
+            sample = {}
+            sample_size = int(len(user.pkg_profile)*sample_proportion)
+            for i in range(sample_size):
+                key = random.choice(item_score.keys())
+                sample[key] = item_score.pop(key)
+            run_strategies(User(item_score),sample,n)
-- 
libgit2 0.21.2