diff --git a/src/experiments/deprecated/k-suite.py b/src/experiments/deprecated/k-suite.py new file mode 100755 index 0000000..ab9f3b9 --- /dev/null +++ b/src/experiments/deprecated/k-suite.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python +""" + k-suite - experiment different neighborhood sizes +""" +__author__ = "Tassia Camoes Araujo " +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" +__license__ = """ + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +""" + +import sys +sys.path.insert(0,'../') +from config import Config +from data import PopconXapianIndex, PopconSubmission +from recommender import Recommender +from user import LocalSystem, User +from evaluation import * +import logging +import random +import Gnuplot +import numpy + +def plot_roc(k,roc_points,log_file): + g = Gnuplot.Gnuplot() + g('set style data points') + g.xlabel('False Positive Rate') + g.ylabel('True Positive Rate') + g('set xrange [0:1.0]') + g('set yrange [0:1.0]') + g.title("Setup: %s-k%d" % (log_file.split("/")[-1],k)) + g.plot(Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"), + Gnuplot.Data(roc_points)) + g.hardcopy(log_file+("-k%.3d.png"%k),terminal="png") + g.hardcopy(log_file+("-k%.3d.ps"%k),terminal="postscript",enhanced=1,color=1) + +def plot_summary(precision,f05,mcc,log_file): + g = Gnuplot.Gnuplot() + g('set style data lines') + g.xlabel('Neighborhood (k)') + g.title("Setup: %s-size20" % (log_file.split("/")[-1])) + g.plot(Gnuplot.Data([[k,sum(precision[k])/len(precision[k])] for k in precision.keys()],title="P"), + Gnuplot.Data([[k,sum(f05[k])/len(f05[k])] for k in f05.keys()],title="F05"), + Gnuplot.Data([[k,sum(mcc[k])/len(mcc[k])] for k in mcc.keys()],title="MCC")) + g.hardcopy(log_file+(".png"),terminal="png") + g.hardcopy(log_file+(".ps"),terminal="postscript",enhanced=1,color=1) + +class ExperimentResults: + def __init__(self,repo_size): + self.repository_size = repo_size + self.precision = [] + self.recall = [] + self.fpr = [] + self.f05 = [] + self.mcc = [] + + def add_result(self,ranking,sample): + predicted = RecommendationResult(dict.fromkeys(ranking,1)) + real = RecommendationResult(sample) + evaluation = Evaluation(predicted,real,self.repository_size) + self.precision.append(evaluation.run(Precision())) + self.recall.append(evaluation.run(Recall())) + self.fpr.append(evaluation.run(FPR())) + self.f05.append(evaluation.run(F_score(0.5))) + self.mcc.append(evaluation.run(MCC())) + + def get_roc_point(self): + tpr = self.recall + fpr = self.fpr + if not tpr or not fpr: + return [0,0] + return [sum(fpr)/len(fpr),sum(tpr)/len(tpr)] + + def get_precision_summary(self): + if not self.precision: return 0 + return sum(self.precision)/len(self.precision) + + def get_f05_summary(self): + if not self.f05: return 0 + return sum(self.f05)/len(self.f05) + + def get_mcc_summary(self): + if not self.mcc: return 0 + return sum(self.mcc)/len(self.mcc) + +if __name__ == '__main__': + if len(sys.argv)<3: + print "Usage: k-suite strategy_str sample_file" + exit(1) + threshold = 20 + iterations = 30 + neighbors = [3,5,10,50,100,150,200,300,400,500] + cfg = Config() + cfg.strategy = sys.argv[1] + sample_file = sys.argv[2] + population_sample = [] + with open(sample_file,'r') as f: + for line in f.readlines(): + user_id = line.strip('\n') + population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id)) + # setup dictionaries and files + roc_summary = {} + recommended = {} + precision_summary = {} + f05_summary = {} + mcc_summary = {} + sample_dir = ("results/k-suite/%s" % sample_file.split('/')[-1]) + if not os.path.exists(sample_dir): + os.makedirs(sample_dir) + log_file = os.path.join(sample_dir,cfg.strategy) + with open(log_file,'w') as f: + f.write("# %s\n\n" % sample_file.split('/')[-1]) + f.write("# strategy %s recommendation_size %d iterations %d\n\n" % + (cfg.strategy,threshold,iterations)) + f.write("# k coverage \tprecision \tf05 \tmcc\n\n") + + for k in neighbors: + roc_summary[k] = [] + recommended[k] = set() + precision_summary[k] = [] + f05_summary[k] = [] + mcc_summary[k] = [] + with open(log_file+"-k%.3d"%k,'w') as f: + f.write("# %s\n\n" % sample_file.split('/')[-1]) + f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k)) + f.write("# roc_point \tprecision \tf05 \tmcc\n\n") + + # main loop per user + for submission_file in population_sample: + user = PopconSystem(submission_file) + user.filter_pkg_profile(cfg.pkgs_filter) + user.maximal_pkg_profile() + for k in neighbors: + cfg.k_neighbors = k + rec = Recommender(cfg) + repo_size = rec.items_repository.get_doccount() + results = ExperimentResults(repo_size) + # n iterations for same recommender and user + for n in range(iterations): + # Fill sample profile + profile_len = len(user.pkg_profile) + item_score = {} + for pkg in user.pkg_profile: + item_score[pkg] = user.item_score[pkg] + sample = {} + sample_size = int(profile_len*0.9) + for i in range(sample_size): + key = random.choice(item_score.keys()) + sample[key] = item_score.pop(key) + iteration_user = User(item_score) + recommendation = rec.get_recommendation(iteration_user,threshold) + if hasattr(recommendation,"ranking"): + results.add_result(recommendation.ranking,sample) + recommended[k] = recommended[k].union(recommendation.ranking) + # save summary + roc_point = results.get_roc_point() + roc_summary[k].append(roc_point) + precision = results.get_precision_summary() + precision_summary[k].append(precision) + f05 = results.get_f05_summary() + f05_summary[k].append(f05) + mcc = results.get_mcc_summary() + mcc_summary[k].append(mcc) + with open(log_file+"-k%.3d"%k,'a') as f: + f.write("[%.2f,%.2f] \t%.4f \t%.4f \t%.4f\n" % + (roc_point[0],roc_point[1],precision,f05,mcc)) + # back to main flow + with open(log_file,'a') as f: + plot_summary(precision_summary,f05_summary,mcc_summary,log_file) + for k in neighbors: + coverage = len(recommended[size])/float(repo_size) + f.write("%3d \t%.2f \t%.4f \t%.4f \t%.4f\n" % + (k,coverage,float(sum(precision_summary[k]))/len(precision_summary[k]), + float(sum(f05_summary[k]))/len(f05_summary[k]), + float(sum(mcc_summary[k]))/len(mcc_summary[k]))) + plot_roc(k,roc_summary[k],log_file) diff --git a/src/experiments/deprecated/strategies-suite.py b/src/experiments/deprecated/strategies-suite.py new file mode 100755 index 0000000..1c84aa3 --- /dev/null +++ b/src/experiments/deprecated/strategies-suite.py @@ -0,0 +1,274 @@ +#!/usr/bin/env python +""" + recommender suite - recommender experiments suite +""" +__author__ = "Tassia Camoes Araujo " +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" +__license__ = """ + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +""" + +import sys +sys.path.insert(0,'../') +from config import Config +from data import PopconXapianIndex, PopconSubmission, AppAptXapianIndex +from recommender import Recommender +from user import LocalSystem, User +from evaluation import * +import logging +import random +import Gnuplot + +#iterations = 3 +#sample_proportions = [0.9] +#weighting = [('bm25',1.2)] +#collaborative = ['knn'] +#content_based = [] +#hybrid = ['knnco'] +#profile_size = [50,100] +#popcon_size = ["1000"] +#neighbors = [50] + +iterations = 10 +sample_proportions = [0.5, 0.6, 0.7, 0.8, 0.9] +weighting = [('bm25',1.2), ('bm25',1.6), ('bm25',2.0), ('trad',0)] +content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset'] +collaborative = ['knn_eset','knn','knn_plus'] +hybrid = ['knnco','knnco_eset'] + +profile_size = range(20,100,20) +#popcon_size = [1000,10000,50000,'full'] +neighbors = range(10,510,50) + +def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file): + # Write recall log + output = open(("%s-%d" % (log_file,n)),'w') + output.write("# %s-n\n" % label["description"]) + output.write("# %s-%d\n" % (label["values"],n)) + output.write("\n%d %d %d\n" % \ + (repo_size,profile_size,len(sample))) + if hasattr(recommendation,"ranking"): + notfound = [] + ranks = [] + for pkg in sample.keys(): + if pkg in recommendation.ranking: + ranks.append(recommendation.ranking.index(pkg)) + else: + notfound.append(pkg) + for r in sorted(ranks): + output.write(str(r)+"\n") + if notfound: + output.write("Out of recommendation:\n") + for pkg in notfound: + output.write(pkg+"\n") + output.close() + +def plot_summary(precision,recall,f1,f05,accuracy,log_file): + # Plot metrics summary + g = Gnuplot.Gnuplot() + g('set style data lines') + g.xlabel('Recommendation size') + g.title("Setup: %s" % log_file.split("/")[-1]) + g.plot(Gnuplot.Data(accuracy,title="Accuracy"), + Gnuplot.Data(precision,title="Precision"), + Gnuplot.Data(recall,title="Recall"), + Gnuplot.Data(f1,title="F_1"), + Gnuplot.Data(f05,title="F_0.5")) + g.hardcopy(log_file+".png",terminal="png") + g.hardcopy(log_file+".ps",terminal="postscript",enhanced=1,color=1) + g('set logscale x') + g('replot') + g.hardcopy(log_file+"-logscale.png",terminal="png") + g.hardcopy(log_file+"-logscale.ps",terminal="postscript",enhanced=1,color=1) + +def get_label(cfg,sample_proportion): + label = {} + if cfg.strategy in content_based: + label["description"] = "strategy-filter-profile-k1_bm25-sample" + label["values"] = ("%s-profile%d-%s-kbm%.1f-sample%.1f" % + (cfg.strategy,cfg.profile_size, + cfg.pkgs_filter.split("/")[-1], + cfg.bm25_k1,sample_proportion)) + elif cfg.strategy in collaborative: + label["description"] = "strategy-knn-filter-k1_bm25-sample" + label["values"] = ("%s-k%d-%s-kbm%.1f-sample%.1f" % + (cfg.strategy,cfg.k_neighbors, + cfg.pkgs_filter.split("/")[-1], + cfg.bm25_k1,sample_proportion)) + elif cfg.strategy in hybrid: + label["description"] = "strategy-knn-filter-profile-k1_bm25-sample" + label["values"] = ("%s-k%d-profile%d-%s-kbm%.1f-sample%.1f" % + (cfg.strategy,cfg.k_neighbors,cfg.profile_size, + cfg.pkgs_filter.split("/")[-1], + cfg.bm25_k1,sample_proportion)) + else: + print "Unknown strategy" + return label + +class ExperimentResults: + def __init__(self,repo_size): + self.repository_size = repo_size + self.accuracy = {} + self.precision = {} + self.recall = {} + self.f1 = {} + self.f05 = {} + points = [1]+range(10,200,10)+range(200,self.repository_size,100) + for size in points: + self.accuracy[size] = [] + self.precision[size] = [] + self.recall[size] = [] + self.f1[size] = [] + self.f05[size] = [] + + def add_result(self,ranking,sample): + for size in self.accuracy.keys(): + predicted = RecommendationResult(dict.fromkeys(ranking[:size],1)) + real = RecommendationResult(sample) + evaluation = Evaluation(predicted,real,self.repository_size) + self.accuracy[size].append(evaluation.run(Accuracy())) + self.precision[size].append(evaluation.run(Precision())) + self.recall[size].append(evaluation.run(Recall())) + self.f1[size].append(evaluation.run(F_score(1))) + self.f05[size].append(evaluation.run(F_score(0.5))) + + def get_precision_summary(self): + summary = [[size,sum(values)/len(values)] for size,values in self.precision.items()] + return sorted(summary) + + def get_recall_summary(self): + summary = [[size,sum(values)/len(values)] for size,values in self.recall.items()] + return sorted(summary) + + def get_f1_summary(self): + summary = [[size,sum(values)/len(values)] for size,values in self.f1.items()] + return sorted(summary) + + def get_f05_summary(self): + summary = [[size,sum(values)/len(values)] for size,values in self.f05.items()] + return sorted(summary) + + def get_accuracy_summary(self): + summary = [[size,sum(values)/len(values)] for size,values in self.accuracy.items()] + return sorted(summary) + + def best_precision(self): + size = max(self.precision, key = lambda x: max(self.precision[x])) + return (size,max(self.precision[size])) + + def best_f1(self): + size = max(self.f1, key = lambda x: max(self.f1[x])) + return (size,max(self.f1[size])) + + def best_f05(self): + size = max(self.f05, key = lambda x: max(self.f05[x])) + return (size,max(self.f05[size])) + +def run_strategy(cfg,user): + for weight in weighting: + cfg.weight = weight[0] + cfg.bm25_k1 = weight[1] + rec = Recommender(cfg) + repo_size = rec.items_repository.get_doccount() + for proportion in sample_proportions: + results = ExperimentResults(repo_size) + label = get_label(cfg,proportion) + log_file = "results/strategies/"+label["values"] + for n in range(iterations): + # Fill sample profile + profile_size = len(user.pkg_profile) + item_score = {} + for pkg in user.pkg_profile: + item_score[pkg] = user.item_score[pkg] + sample = {} + sample_size = int(profile_size*proportion) + for i in range(sample_size): + key = random.choice(item_score.keys()) + sample[key] = item_score.pop(key) + iteration_user = User(item_score) + recommendation = rec.get_recommendation(iteration_user,repo_size) + write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file) + if hasattr(recommendation,"ranking"): + results.add_result(recommendation.ranking,sample) + with open(log_file,'w') as f: + precision_10 = sum(results.precision[10])/len(results.precision[10]) + f1_10 = sum(results.f1[10])/len(results.f1[10]) + f05_10 = sum(results.f05[10])/len(results.f05[10]) + f.write("# %s\n# %s\n\ncoverage %d\n\n" % + (label["description"],label["values"],recommendation.size)) + f.write("# best results (recommendation size; metric)\n") + f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\nf05 (%d; %.2f)\n\n" % + (results.best_precision()[0],results.best_precision()[1], + results.best_f1()[0],results.best_f1()[1], + results.best_f05()[0],results.best_f05()[1])) + f.write("# recommendation size 10\nprecision (10; %.2f)\nf1 (10; %.2f)\nf05 (10; %.2f)" % + (precision_10,f1_10,f05_10)) + precision = results.get_precision_summary() + recall = results.get_recall_summary() + f1 = results.get_f1_summary() + f05 = results.get_f05_summary() + accuracy = results.get_accuracy_summary() + plot_summary(precision,recall,f1,f05,accuracy,log_file) + +def run_content(user,cfg): + for strategy in content_based: + cfg.strategy = strategy + for size in profile_size: + cfg.profile_size = size + run_strategy(cfg,user) + +def run_collaborative(user,cfg): + popcon_desktopapps = cfg.popcon_desktopapps + popcon_programs = cfg.popcon_programs + for strategy in collaborative: + cfg.strategy = strategy + for k in neighbors: + cfg.k_neighbors = k + #for size in popcon_size: + # if size: + # cfg.popcon_desktopapps = popcon_desktopapps+"_"+size + # cfg.popcon_programs = popcon_programs+"_"+size + run_strategy(cfg,user) + +def run_hybrid(user,cfg): + popcon_desktopapps = cfg.popcon_desktopapps + popcon_programs = cfg.popcon_programs + for strategy in hybrid: + cfg.strategy = strategy + for k in neighbors: + cfg.k_neighbors = k + #for size in popcon_size: + # if size: + # cfg.popcon_desktopapps = popcon_desktopapps+"_"+size + # cfg.popcon_programs = popcon_programs+"_"+size + for size in profile_size: + cfg.profile_size = size + run_strategy(cfg,user) + +if __name__ == '__main__': + #user = LocalSystem() + #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps")) + + cfg = Config() + user = PopconSystem("/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7") + #user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623") + user.filter_pkg_profile(cfg.pkgs_filter) + user.maximal_pkg_profile() + + if "content" in sys.argv or len(sys.argv)<2: + run_content(user,cfg) + if "collaborative" in sys.argv or len(sys.argv)<2: + run_collaborative(user,cfg) + if "hybrid" in sys.argv or len(sys.argv)<2: + run_hybrid(user,cfg) -- libgit2 0.21.2