Commit ccd4ef5568d421a430f37975baff0bacf775b91c

Authored by Tássia Camões Araújo
1 parent e2be2c33
Exists in master and in 1 other branch add_vagrant

Renamed files.

src/experiments/k-suite.py
@@ -1,186 +0,0 @@ @@ -1,186 +0,0 @@
1 -#!/usr/bin/env python  
2 -"""  
3 - k-suite - experiment different neighborhood sizes  
4 -"""  
5 -__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"  
6 -__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"  
7 -__license__ = """  
8 - This program is free software: you can redistribute it and/or modify  
9 - it under the terms of the GNU General Public License as published by  
10 - the Free Software Foundation, either version 3 of the License, or  
11 - (at your option) any later version.  
12 -  
13 - This program is distributed in the hope that it will be useful,  
14 - but WITHOUT ANY WARRANTY; without even the implied warranty of  
15 - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the  
16 - GNU General Public License for more details.  
17 -  
18 - You should have received a copy of the GNU General Public License  
19 - along with this program. If not, see <http://www.gnu.org/licenses/>.  
20 -"""  
21 -  
22 -import sys  
23 -sys.path.insert(0,'../')  
24 -from config import Config  
25 -from data import PopconXapianIndex, PopconSubmission  
26 -from recommender import Recommender  
27 -from user import LocalSystem, User  
28 -from evaluation import *  
29 -import logging  
30 -import random  
31 -import Gnuplot  
32 -import numpy  
33 -  
def plot_roc(k, roc_points, log_file):
    """Plot the ROC points collected for neighbourhood size `k`.

    The chart shows `roc_points` as points together with the (0,0)-(1,1)
    diagonal drawn as a line, and is saved as ``<log_file>-kNNN.png`` and
    ``<log_file>-kNNN.ps`` (NNN is `k` zero-padded to three digits).
    """
    setup_name = log_file.split("/")[-1]
    output_base = log_file + ("-k%.3d" % k)

    chart = Gnuplot.Gnuplot()
    chart('set style data points')
    chart.xlabel('False Positive Rate')
    chart.ylabel('True Positive Rate')
    # Both rates live in [0, 1]; fix the axes to that range.
    chart('set xrange [0:1.0]')
    chart('set yrange [0:1.0]')
    chart.title("Setup: %s-k%d" % (setup_name, k))

    diagonal = Gnuplot.Data([[0, 0], [1, 1]], with_="lines lt 7")
    measured = Gnuplot.Data(roc_points)
    chart.plot(diagonal, measured)

    chart.hardcopy(output_base + ".png", terminal="png")
    chart.hardcopy(output_base + ".ps", terminal="postscript",
                   enhanced=1, color=1)
46 -  
def plot_summary(precision, f05, mcc, log_file):
    """Plot the mean precision, F0.5 and MCC against the neighbourhood size.

    `precision`, `f05` and `mcc` are dicts mapping a neighbourhood size k to
    the list of values collected for that k. The chart is saved as
    ``<log_file>.png`` and ``<log_file>.ps``.
    """
    def mean_by_k(metric):
        # The chart uses the "lines" style, which joins the points in the
        # order they are given, so the series must be sorted by k; plain dict
        # iteration order would produce zig-zag segments.
        # Sizes without any collected value are skipped instead of dividing
        # by zero.
        return [[k, float(sum(metric[k])) / len(metric[k])]
                for k in sorted(metric) if metric[k]]

    g = Gnuplot.Gnuplot()
    g('set style data lines')
    g.xlabel('Neighborhood (k)')
    g.title("Setup: %s-size20" % (log_file.split("/")[-1]))
    g.plot(Gnuplot.Data(mean_by_k(precision), title="P"),
           Gnuplot.Data(mean_by_k(f05), title="F05"),
           Gnuplot.Data(mean_by_k(mcc), title="MCC"))
    g.hardcopy(log_file + ".png", terminal="png")
    g.hardcopy(log_file + ".ps", terminal="postscript", enhanced=1, color=1)
57 -  
class ExperimentResults:
    """Accumulates evaluation metrics over repeated runs of one setup.

    Each call to `add_result` appends one value per metric; the `get_*`
    methods return the arithmetic mean of what was collected so far.
    """

    def __init__(self, repo_size):
        # Repository size is handed to every Evaluation built in add_result.
        self.repository_size = repo_size
        self.precision = []
        self.recall = []
        self.fpr = []
        self.f05 = []
        self.mcc = []

    def add_result(self, ranking, sample):
        """Evaluate `ranking` against the held-out `sample` and store metrics."""
        predicted = RecommendationResult(dict.fromkeys(ranking, 1))
        real = RecommendationResult(sample)
        evaluation = Evaluation(predicted, real, self.repository_size)
        # Same metric order as the attribute declarations above.
        for store, metric in ((self.precision, Precision()),
                              (self.recall, Recall()),
                              (self.fpr, FPR()),
                              (self.f05, F_score(0.5)),
                              (self.mcc, MCC())):
            store.append(evaluation.run(metric))

    def get_roc_point(self):
        """Return [mean FPR, mean TPR], or [0,0] when nothing was recorded."""
        true_positive_rates = self.recall
        false_positive_rates = self.fpr
        if true_positive_rates and false_positive_rates:
            mean_fpr = sum(false_positive_rates) / len(false_positive_rates)
            mean_tpr = sum(true_positive_rates) / len(true_positive_rates)
            return [mean_fpr, mean_tpr]
        return [0, 0]

    def get_precision_summary(self):
        """Return the mean precision, or 0 when nothing was recorded."""
        values = self.precision
        return sum(values) / len(values) if values else 0

    def get_f05_summary(self):
        """Return the mean F0.5 score, or 0 when nothing was recorded."""
        values = self.f05
        return sum(values) / len(values) if values else 0

    def get_mcc_summary(self):
        """Return the mean MCC, or 0 when nothing was recorded."""
        values = self.mcc
        return sum(values) / len(values) if values else 0
95 -  
if __name__ == '__main__':
    # Experiment driver: for every user listed in the sample file and every
    # neighbourhood size in `neighbors`, repeatedly hold out 90% of the user
    # profile, ask for `threshold` recommendations based on the rest, and log
    # and plot the averaged metrics.

    # NOTE(review): the visible import block has no "import os" although this
    # script uses it; importing locally keeps the script self-contained.
    import os

    if len(sys.argv) < 3:
        print("Usage: k-suite strategy_str sample_file")
        sys.exit(1)
    threshold = 20       # size of each requested recommendation
    iterations = 30      # repetitions per (user, k) pair
    neighbors = [3, 5, 10, 50, 100, 150, 200, 300, 400, 500]
    cfg = Config()
    cfg.strategy = sys.argv[1]
    sample_file = sys.argv[2]

    # One popcon submission path per user id listed in the sample file.
    population_sample = []
    with open(sample_file, 'r') as f:
        for line in f:
            user_id = line.strip('\n')
            if not user_id:
                # A blank line would otherwise become a bogus path.
                continue
            population_sample.append(
                os.path.join(cfg.popcon_dir, user_id[:2], user_id))
    if not population_sample:
        # Without users the summaries below would divide by zero.
        print("No user ids found in %s" % sample_file)
        sys.exit(1)

    # setup dictionaries and files
    roc_summary = {}
    recommended = {}
    precision_summary = {}
    f05_summary = {}
    mcc_summary = {}
    sample_name = sample_file.split('/')[-1]
    sample_dir = "results/k-suite/%s" % sample_name
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)
    log_file = os.path.join(sample_dir, cfg.strategy)
    with open(log_file, 'w') as f:
        f.write("# %s\n\n" % sample_name)
        f.write("# strategy %s recommendation_size %d iterations %d\n\n" %
                (cfg.strategy, threshold, iterations))
        f.write("# k coverage \tprecision \tf05 \tmcc\n\n")

    for k in neighbors:
        roc_summary[k] = []
        recommended[k] = set()
        precision_summary[k] = []
        f05_summary[k] = []
        mcc_summary[k] = []
        with open(log_file + "-k%.3d" % k, 'w') as f:
            f.write("# %s\n\n" % sample_name)
            f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy, k))
            f.write("# roc_point \tprecision \tf05 \tmcc\n\n")

    # main loop per user
    for submission_file in population_sample:
        # NOTE(review): PopconSystem is not among the names imported
        # explicitly above; presumably it arrives through
        # "from evaluation import *" -- confirm.
        user = PopconSystem(submission_file)
        user.filter_pkg_profile(cfg.pkgs_filter)
        user.maximal_pkg_profile()
        for k in neighbors:
            cfg.k_neighbors = k
            rec = Recommender(cfg)
            repo_size = rec.items_repository.get_doccount()
            results = ExperimentResults(repo_size)
            # n iterations for same recommender and user
            for n in range(iterations):
                # Split the profile: 90% is held out as the sample to be
                # predicted, the remainder is what the recommender sees.
                profile_len = len(user.pkg_profile)
                item_score = dict((pkg, user.item_score[pkg])
                                  for pkg in user.pkg_profile)
                sample = {}
                sample_size = int(profile_len * 0.9)
                for i in range(sample_size):
                    key = random.choice(list(item_score))
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,
                                                        threshold)
                if hasattr(recommendation, "ranking"):
                    results.add_result(recommendation.ranking, sample)
                    recommended[k] = recommended[k].union(
                        recommendation.ranking)
            # save summary
            roc_point = results.get_roc_point()
            roc_summary[k].append(roc_point)
            precision = results.get_precision_summary()
            precision_summary[k].append(precision)
            f05 = results.get_f05_summary()
            f05_summary[k].append(f05)
            mcc = results.get_mcc_summary()
            mcc_summary[k].append(mcc)
            with open(log_file + "-k%.3d" % k, 'a') as f:
                f.write("[%.2f,%.2f] \t%.4f \t%.4f \t%.4f\n" %
                        (roc_point[0], roc_point[1], precision, f05, mcc))

    # back to main flow
    plot_summary(precision_summary, f05_summary, mcc_summary, log_file)
    with open(log_file, 'a') as f:
        for k in neighbors:
            # Coverage is the share of the repository that was recommended
            # at least once for this k. The original indexed `recommended`
            # with an undefined name `size`; `k` is the key used everywhere
            # else. repo_size holds the value from the last recommender.
            coverage = len(recommended[k]) / float(repo_size)
            f.write("%3d \t%.2f \t%.4f \t%.4f \t%.4f\n" %
                    (k, coverage,
                     float(sum(precision_summary[k])) / len(precision_summary[k]),
                     float(sum(f05_summary[k])) / len(f05_summary[k]),
                     float(sum(mcc_summary[k])) / len(mcc_summary[k])))
            plot_roc(k, roc_summary[k], log_file)
src/experiments/roc-suite.py
@@ -1,231 +0,0 @@ @@ -1,231 +0,0 @@
1 -#!/usr/bin/env python  
2 -"""  
3 - recommender suite - recommender experiments suite  
4 -"""  
5 -__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"  
6 -__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"  
7 -__license__ = """  
8 - This program is free software: you can redistribute it and/or modify  
9 - it under the terms of the GNU General Public License as published by  
10 - the Free Software Foundation, either version 3 of the License, or  
11 - (at your option) any later version.  
12 -  
13 - This program is distributed in the hope that it will be useful,  
14 - but WITHOUT ANY WARRANTY; without even the implied warranty of  
15 - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the  
16 - GNU General Public License for more details.  
17 -  
18 - You should have received a copy of the GNU General Public License  
19 - along with this program. If not, see <http://www.gnu.org/licenses/>.  
20 -"""  
21 -  
22 -import sys  
23 -sys.path.insert(0,'../')  
24 -from config import Config  
25 -from data import PopconXapianIndex, PopconSubmission  
26 -from recommender import Recommender  
27 -from user import LocalSystem, User  
28 -from evaluation import *  
29 -import logging  
30 -import random  
31 -import Gnuplot  
32 -import numpy  
33 -  
# Reduced setup kept for quick trial runs (swap it in by hand):
#   iterations = 3
#   sample_proportions = [0.9]
#   weighting = [('bm25',1.2)]
#   collaborative = ['knn_eset']
#   content_based = ['cb']
#   hybrid = ['knnco']
#   profile_size = [50,100]
#   popcon_size = ["1000"]
#   neighbors = [50]

# Full experiment setup.
iterations = 30                  # repetitions per configuration
sample_proportions = [0.9]       # share of the profile held out as sample
weighting = [('bm25', 1.0)]      # (weighting scheme, bm25 k1) pairs

# Strategy names grouped by family; get_label() dispatches on these lists.
content_based = [
    'cb', 'cbt', 'cbd', 'cbh',
    'cb_eset', 'cbt_eset', 'cbd_eset', 'cbh_eset',
]
collaborative = ['knn_eset', 'knn', 'knn_plus']
hybrid = ['knnco', 'knnco_eset']

# Swept values for cfg.profile_size and cfg.k_neighbors.
profile_size = list(range(20, 200, 40))
neighbors = list(range(10, 510, 50))
52 -  
def write_recall_log(label, n, sample, recommendation, profile_size,
                     repo_size, log_file):
    """Write the recall log of iteration `n` to ``<log_file>-NN``.

    The file starts with the label description and values, followed by the
    repository size, profile size and sample size. When `recommendation` has
    a ``ranking`` attribute, the 0-based rank of every sample package found
    in it is listed in increasing order, followed by the sample packages that
    were not recommended at all.

    label          -- dict with "description" and "values" entries
    n              -- iteration number (zero-padded to two digits)
    sample         -- dict whose keys are the held-out packages
    recommendation -- object that may carry a ``ranking`` sequence
    """
    # The context manager closes the file even if a write fails.
    with open("%s-%.2d" % (log_file, n), 'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%.2d\n" % (label["values"], n))
        output.write("\n# repository profile sample\n%d %d %d\n" %
                     (repo_size, profile_size, len(sample)))
        if not hasattr(recommendation, "ranking"):
            return

        # Map each package to its first position once, instead of scanning
        # the ranking twice ("in" and ".index") for every sample package.
        position = {}
        for index, pkg in enumerate(recommendation.ranking):
            position.setdefault(pkg, index)

        ranks = []
        notfound = []
        for pkg in sample:
            if pkg in position:
                ranks.append(position[pkg])
            else:
                notfound.append(pkg)

        for rank in sorted(ranks):
            output.write(str(rank) + "\n")
        if notfound:
            output.write("# out of recommendation:\n")
            for pkg in notfound:
                output.write(pkg + "\n")
75 -  
def plot_roc(roc_points, eauc, c, p, log_file):
    """Plot a ROC curve annotated with its summary figures.

    `roc_points` is drawn as a line next to the (0,0)-(1,1) diagonal; `c`,
    `p` and `eauc` are printed on the chart as the "C", "P(20)" and "AUC"
    labels. The chart is saved as ``<log_file>-roc.png`` and
    ``<log_file>-roc.ps``.
    """
    chart = Gnuplot.Gnuplot()
    chart('set style data lines')
    chart.xlabel('False Positive Rate')
    chart.ylabel('True Positive Rate')
    chart('set xrange [0:1.0]')
    chart('set yrange [0:1.0]')
    chart.title("Setup: %s" % log_file.split("/")[-1])

    # Summary figures stacked in the lower right corner of the chart.
    annotations = (
        'set label "C %.2f" at 0.8,0.25' % c,
        'set label "P(20) %.2f" at 0.8,0.2' % p,
        'set label "AUC %.4f" at 0.8,0.15' % eauc,
    )
    for command in annotations:
        chart(command)

    curve = Gnuplot.Data(roc_points, title="ROC")
    diagonal = Gnuplot.Data([[0, 0], [1, 1]], with_="lines lt 7")
    chart.plot(curve, diagonal)

    output_base = log_file + "-roc"
    chart.hardcopy(output_base + ".png", terminal="png")
    chart.hardcopy(output_base + ".ps", terminal="postscript",
                   enhanced=1, color=1)
92 -  
def get_label(cfg, sample_proportion):
    """Build the label dict ("description" and "values") for a setup.

    The layout depends on the family `cfg.strategy` belongs to
    (content_based, collaborative or hybrid). For any other strategy
    "Unknown strategy" is printed and an empty dict is returned.
    `sample_proportion` is accepted but not used here.
    """
    strategy = cfg.strategy
    if strategy in content_based:
        return {
            "description": "strategy-profile",
            "values": "%s-profile%.3d" % (strategy, cfg.profile_size),
        }
    if strategy in collaborative:
        return {
            "description": "strategy-knn",
            "values": "%s-k%.3d" % (strategy, cfg.k_neighbors),
        }
    if strategy in hybrid:
        return {
            "description": "strategy-knn-profile",
            "values": "%s-k%.3d-profile%.3d" %
                      (strategy, cfg.k_neighbors, cfg.profile_size),
        }
    print("Unknown strategy")
    return {}
110 -  
class ExperimentResults:
    """Collects precision, recall and FPR per recommendation size.

    Metrics are tracked at sizes 1, 10, 20, ... below the repository size;
    every `add_result` call appends one value per size and metric.
    """

    def __init__(self, repo_size):
        self.repository_size = repo_size
        # Union of every package seen in any recorded ranking.
        self.recommended = set()
        cutoffs = [1] + list(range(10, self.repository_size, 10))
        self.precision = dict((cutoff, []) for cutoff in cutoffs)
        self.recall = dict((cutoff, []) for cutoff in cutoffs)
        self.fpr = dict((cutoff, []) for cutoff in cutoffs)

    def add_result(self, ranking, sample):
        """Evaluate every prefix ``ranking[:size]`` against `sample`."""
        self.recommended = self.recommended.union(ranking)
        for size in self.precision.keys():
            top_items = dict.fromkeys(ranking[:size], 1)
            predicted = RecommendationResult(top_items)
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted, real, self.repository_size)
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            self.fpr[size].append(evaluation.run(FPR()))

    def get_roc_points(self):
        """Return the sorted list of [mean FPR, mean TPR], one per size.

        The averaging is done per threshold (= size of recommendation).
        """
        averaged = [
            [sum(self.fpr[size]) / len(self.fpr[size]),
             sum(self.recall[size]) / len(self.recall[size])]
            for size in self.recall
        ]
        averaged.sort()
        return averaged
143 -  
def run_strategy(cfg, user):
    """Run the strategy configured in `cfg` for `user` and log a ROC summary.

    For each entry of `weighting` and each `sample_proportions` value the
    user profile is split `iterations` times into a held-out sample and the
    remaining profile; a recommendation of repository size is requested for
    the remainder and evaluated against the sample. Per-iteration recall logs,
    a summary file and a ROC plot are written under
    ``results/roc-suite/<first 8 chars of user id>/``.
    """
    # NOTE(review): the visible import block has no "import os" although this
    # function needs it; a local import keeps the function self-contained.
    import os

    for weight in weighting:
        cfg.weight = weight[0]
        cfg.bm25_k1 = weight[1]
        rec = Recommender(cfg)
        repo_size = rec.items_repository.get_doccount()
        for proportion in sample_proportions:
            results = ExperimentResults(repo_size)
            label = get_label(cfg, proportion)
            user_dir = "results/roc-suite/%s" % user.user_id[:8]
            if not os.path.exists(user_dir):
                # makedirs also creates "results/roc-suite" when missing;
                # os.mkdir would fail on a fresh checkout.
                os.makedirs(user_dir)
            log_file = os.path.join(user_dir, label["values"])

            evaluated = 0
            for n in range(iterations):
                # Fill sample profile: move a random `proportion` of the
                # profile into `sample`; the rest stays in `item_score`.
                profile_len = len(user.pkg_profile)
                item_score = dict((pkg, user.item_score[pkg])
                                  for pkg in user.pkg_profile)
                sample = {}
                sample_size = int(profile_len * proportion)
                for i in range(sample_size):
                    key = random.choice(list(item_score))
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,
                                                        repo_size)
                write_recall_log(label, n, sample, recommendation,
                                 profile_len, repo_size, log_file)
                if hasattr(recommendation, "ranking"):
                    results.add_result(recommendation.ranking, sample)
                    evaluated += 1

            if not evaluated:
                # Nothing was recorded, so every average below would divide
                # by zero; skip the summary for this setup.
                print("No ranked recommendation for %s; summary skipped" %
                      label["values"])
                continue

            roc_points = results.get_roc_points()
            x_coord = [point[0] for point in roc_points]
            y_coord = [point[1] for point in roc_points]
            auc = numpy.trapz(y=y_coord, x=x_coord)
            # Extended AUC: add the area between (0,0) and the first point
            # and between the last point and (1,1).
            eauc = (auc +
                    numpy.trapz(y=[0, roc_points[0][1]],
                                x=[0, roc_points[0][0]]) +
                    numpy.trapz(y=[roc_points[-1][1], 1],
                                x=[roc_points[-1][0], 1]))
            # Reported as "p(20)" in the log and "P(20)" on the plot, so it
            # is taken at recommendation size 20 (the original read size 10).
            precision_20 = (sum(results.precision[20]) /
                            len(results.precision[20]))
            coverage = len(results.recommended) / float(repo_size)
            with open(log_file, 'w') as f:
                f.write("# %s\n# %s\n\n" %
                        (label["description"], label["values"]))
                f.write("# coverage \tp(20) \tauc \teauc\n"
                        "\t%.2f \t%.2f \t%.4f \t%.4f\n\n" %
                        (coverage, precision_20, auc, eauc))
            plot_roc(roc_points, eauc, coverage, precision_20, log_file)
188 -  
def run_content(user, cfg):
    """Run every content-based strategy with every swept profile size."""
    for strategy_name in content_based:
        cfg.strategy = strategy_name
        for n_terms in profile_size:
            cfg.profile_size = n_terms
            run_strategy(cfg, user)
195 -  
def run_collaborative(user, cfg):
    """Run every collaborative strategy with every swept neighbourhood size.

    The original also copied cfg.popcon_desktopapps and cfg.popcon_programs
    into locals that were never read; those dead assignments are removed.
    """
    for strategy in collaborative:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            run_strategy(cfg, user)
204 -  
def run_hybrid(user, cfg):
    """Run every hybrid strategy over all neighbourhood and profile sizes.

    The original also copied cfg.popcon_desktopapps and cfg.popcon_programs
    into locals that were never read; those dead assignments are removed.
    """
    for strategy in hybrid:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            for size in profile_size:
                cfg.profile_size = size
                run_strategy(cfg, user)
215 -  
if __name__ == '__main__':
    # Entry point: roc-suite popcon_submission_path [content|collaborative|hybrid]
    arguments = sys.argv
    if len(arguments) < 2:
        print("Usage: roc-suite popcon_submission_path [content|collaborative|hybrid]")
        exit(1)

    cfg = Config()
    user = PopconSystem(arguments[1])
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    # With no family named on the command line, all three are run.
    run_all = len(arguments) < 3
    families = (
        ("content", run_content),
        ("collaborative", run_collaborative),
        ("hybrid", run_hybrid),
    )
    for keyword, runner in families:
        if keyword in arguments or run_all:
            runner(user, cfg)
src/experiments/strategies-suite.py
@@ -1,274 +0,0 @@ @@ -1,274 +0,0 @@
1 -#!/usr/bin/env python  
2 -"""  
3 - recommender suite - recommender experiments suite  
4 -"""  
5 -__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"  
6 -__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"  
7 -__license__ = """  
8 - This program is free software: you can redistribute it and/or modify  
9 - it under the terms of the GNU General Public License as published by  
10 - the Free Software Foundation, either version 3 of the License, or  
11 - (at your option) any later version.  
12 -  
13 - This program is distributed in the hope that it will be useful,  
14 - but WITHOUT ANY WARRANTY; without even the implied warranty of  
15 - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the  
16 - GNU General Public License for more details.  
17 -  
18 - You should have received a copy of the GNU General Public License  
19 - along with this program. If not, see <http://www.gnu.org/licenses/>.  
20 -"""  
21 -  
22 -import sys  
23 -sys.path.insert(0,'../')  
24 -from config import Config  
25 -from data import PopconXapianIndex, PopconSubmission, AppAptXapianIndex  
26 -from recommender import Recommender  
27 -from user import LocalSystem, User  
28 -from evaluation import *  
29 -import logging  
30 -import random  
31 -import Gnuplot  
32 -  
# Reduced setup kept for quick trial runs (swap it in by hand):
#   iterations = 3
#   sample_proportions = [0.9]
#   weighting = [('bm25',1.2)]
#   collaborative = ['knn']
#   content_based = []
#   hybrid = ['knnco']
#   profile_size = [50,100]
#   popcon_size = ["1000"]
#   neighbors = [50]

# Full experiment setup.
iterations = 10                                   # repetitions per setup
sample_proportions = [0.5, 0.6, 0.7, 0.8, 0.9]    # share of profile held out
weighting = [                                     # (scheme, bm25 k1) pairs
    ('bm25', 1.2),
    ('bm25', 1.6),
    ('bm25', 2.0),
    ('trad', 0),
]

# Strategy names grouped by family; get_label() dispatches on these lists.
content_based = [
    'cb', 'cbt', 'cbd', 'cbh',
    'cb_eset', 'cbt_eset', 'cbd_eset', 'cbh_eset',
]
collaborative = ['knn_eset', 'knn', 'knn_plus']
hybrid = ['knnco', 'knnco_eset']

# Swept values for cfg.profile_size and cfg.k_neighbors.
profile_size = list(range(20, 100, 20))
# Disabled sweep dimension: popcon_size = [1000,10000,50000,'full']
neighbors = list(range(10, 510, 50))
53 -  
def write_recall_log(label, n, sample, recommendation, profile_size,
                     repo_size, log_file):
    """Write the recall log of iteration `n` to ``<log_file>-<n>``.

    The file starts with the label description and values, followed by the
    repository size, profile size and sample size. When `recommendation` has
    a ``ranking`` attribute, the 0-based rank of every sample package found
    in it is listed in increasing order, followed by the sample packages that
    were not recommended at all.

    label          -- dict with "description" and "values" entries
    n              -- iteration number
    sample         -- dict whose keys are the held-out packages
    recommendation -- object that may carry a ``ranking`` sequence
    """
    # The context manager closes the file even if a write fails.
    with open("%s-%d" % (log_file, n), 'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%d\n" % (label["values"], n))
        output.write("\n%d %d %d\n" % (repo_size, profile_size, len(sample)))
        if not hasattr(recommendation, "ranking"):
            return

        # Map each package to its first position once, instead of scanning
        # the ranking twice ("in" and ".index") for every sample package.
        position = {}
        for index, pkg in enumerate(recommendation.ranking):
            position.setdefault(pkg, index)

        ranks = []
        notfound = []
        for pkg in sample:
            if pkg in position:
                ranks.append(position[pkg])
            else:
                notfound.append(pkg)

        for rank in sorted(ranks):
            output.write(str(rank) + "\n")
        if notfound:
            output.write("Out of recommendation:\n")
            for pkg in notfound:
                output.write(pkg + "\n")
76 -  
def plot_summary(precision, recall, f1, f05, accuracy, log_file):
    """Plot the five metric series against the recommendation size.

    Each argument is a series accepted by Gnuplot.Data. The chart is saved as
    ``<log_file>.png`` / ``.ps`` and, after switching the x axis to a
    logarithmic scale, as ``<log_file>-logscale.png`` / ``.ps``.
    """
    chart = Gnuplot.Gnuplot()

    def save(base_name):
        # Every version of the chart is written in both formats.
        chart.hardcopy(base_name + ".png", terminal="png")
        chart.hardcopy(base_name + ".ps", terminal="postscript",
                       enhanced=1, color=1)

    chart('set style data lines')
    chart.xlabel('Recommendation size')
    chart.title("Setup: %s" % log_file.split("/")[-1])
    chart.plot(Gnuplot.Data(accuracy, title="Accuracy"),
               Gnuplot.Data(precision, title="Precision"),
               Gnuplot.Data(recall, title="Recall"),
               Gnuplot.Data(f1, title="F_1"),
               Gnuplot.Data(f05, title="F_0.5"))
    save(log_file)

    # Same data redrawn with a logarithmic x axis.
    chart('set logscale x')
    chart('replot')
    save(log_file + "-logscale")
94 -  
def get_label(cfg, sample_proportion):
    """Build the label dict ("description" and "values") for a setup.

    The leading part depends on the family `cfg.strategy` belongs to
    (content_based, collaborative or hybrid); every family then appends the
    package filter name, the bm25 k1 value and the sample proportion. For any
    other strategy "Unknown strategy" is printed and an empty dict is
    returned.
    """
    strategy = cfg.strategy
    if strategy in content_based:
        description = "strategy-filter-profile-k1_bm25-sample"
        head = "%s-profile%d" % (strategy, cfg.profile_size)
    elif strategy in collaborative:
        description = "strategy-knn-filter-k1_bm25-sample"
        head = "%s-k%d" % (strategy, cfg.k_neighbors)
    elif strategy in hybrid:
        description = "strategy-knn-filter-profile-k1_bm25-sample"
        head = "%s-k%d-profile%d" % (strategy, cfg.k_neighbors,
                                     cfg.profile_size)
    else:
        print("Unknown strategy")
        return {}

    # Tail shared by all three families.
    filter_name = cfg.pkgs_filter.split("/")[-1]
    values = "%s-%s-kbm%.1f-sample%.1f" % (head, filter_name, cfg.bm25_k1,
                                           sample_proportion)
    return {"description": description, "values": values}
118 -  
class ExperimentResults:
    """Collects accuracy, precision, recall, F1 and F0.5 per size.

    Metrics are tracked at recommendation sizes 1, 10, 20, ..., 190 and then
    every 100 from 200 up to (excluding) the repository size. Every
    `add_result` call appends one value per size and metric.
    """

    def __init__(self, repo_size):
        self.repository_size = repo_size
        sizes = ([1] + list(range(10, 200, 10)) +
                 list(range(200, self.repository_size, 100)))
        self.accuracy = dict((size, []) for size in sizes)
        self.precision = dict((size, []) for size in sizes)
        self.recall = dict((size, []) for size in sizes)
        self.f1 = dict((size, []) for size in sizes)
        self.f05 = dict((size, []) for size in sizes)

    def add_result(self, ranking, sample):
        """Evaluate every prefix ``ranking[:size]`` against `sample`."""
        for size in self.accuracy.keys():
            top_items = dict.fromkeys(ranking[:size], 1)
            predicted = RecommendationResult(top_items)
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted, real, self.repository_size)
            self.accuracy[size].append(evaluation.run(Accuracy()))
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            self.f1[size].append(evaluation.run(F_score(1)))
            self.f05[size].append(evaluation.run(F_score(0.5)))

    def _mean_by_size(self, per_size):
        # Sorted [size, mean] pairs for one metric dict.
        pairs = []
        for size, values in per_size.items():
            pairs.append([size, sum(values) / len(values)])
        pairs.sort()
        return pairs

    def _best(self, per_size):
        # (size, value) of the single highest value recorded for a metric.
        top_size = max(per_size, key=lambda size: max(per_size[size]))
        return (top_size, max(per_size[top_size]))

    def get_precision_summary(self):
        """Return sorted [size, mean precision] pairs."""
        return self._mean_by_size(self.precision)

    def get_recall_summary(self):
        """Return sorted [size, mean recall] pairs."""
        return self._mean_by_size(self.recall)

    def get_f1_summary(self):
        """Return sorted [size, mean F1] pairs."""
        return self._mean_by_size(self.f1)

    def get_f05_summary(self):
        """Return sorted [size, mean F0.5] pairs."""
        return self._mean_by_size(self.f05)

    def get_accuracy_summary(self):
        """Return sorted [size, mean accuracy] pairs."""
        return self._mean_by_size(self.accuracy)

    def best_precision(self):
        """Return (size, value) of the highest precision recorded."""
        return self._best(self.precision)

    def best_f1(self):
        """Return (size, value) of the highest F1 recorded."""
        return self._best(self.f1)

    def best_f05(self):
        """Return (size, value) of the highest F0.5 recorded."""
        return self._best(self.f05)
177 -  
def run_strategy(cfg, user):
    """Run the strategy configured in `cfg` for `user` and log its metrics.

    For each entry of `weighting` and each `sample_proportions` value the
    user profile is split `iterations` times into a held-out sample and the
    remaining profile; a recommendation of repository size is requested for
    the remainder and evaluated against the sample. Per-iteration recall logs,
    a summary file and metric plots are written under ``results/strategies/``.
    """
    # NOTE(review): the visible import block has no "import os" although this
    # function now needs it; a local import keeps it self-contained.
    import os

    log_dir = "results/strategies"
    if not os.path.exists(log_dir):
        # The log files are opened for writing below; without the directory
        # the very first open() fails.
        os.makedirs(log_dir)

    for weight in weighting:
        cfg.weight = weight[0]
        cfg.bm25_k1 = weight[1]
        rec = Recommender(cfg)
        repo_size = rec.items_repository.get_doccount()
        for proportion in sample_proportions:
            results = ExperimentResults(repo_size)
            label = get_label(cfg, proportion)
            log_file = "results/strategies/" + label["values"]

            evaluated = 0
            for n in range(iterations):
                # Fill sample profile: move a random `proportion` of the
                # profile into `sample`; the rest stays in `item_score`.
                # (Named profile_len so it does not shadow the module-level
                # profile_size sweep list.)
                profile_len = len(user.pkg_profile)
                item_score = dict((pkg, user.item_score[pkg])
                                  for pkg in user.pkg_profile)
                sample = {}
                sample_size = int(profile_len * proportion)
                for i in range(sample_size):
                    key = random.choice(list(item_score))
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,
                                                        repo_size)
                write_recall_log(label, n, sample, recommendation,
                                 profile_len, repo_size, log_file)
                if hasattr(recommendation, "ranking"):
                    results.add_result(recommendation.ranking, sample)
                    evaluated += 1

            if not evaluated:
                # Nothing was recorded, so the averages and best_* lookups
                # below would fail on empty lists; skip this setup.
                print("No ranked recommendation for %s; summary skipped" %
                      label["values"])
                continue

            precision_10 = (sum(results.precision[10]) /
                            len(results.precision[10]))
            f1_10 = sum(results.f1[10]) / len(results.f1[10])
            f05_10 = sum(results.f05[10]) / len(results.f05[10])
            # Each best_* scans every size, so compute it once.
            best_precision = results.best_precision()
            best_f1 = results.best_f1()
            best_f05 = results.best_f05()
            with open(log_file, 'w') as f:
                # NOTE(review): "coverage" is the size of the recommendation
                # from the last iteration only -- confirm this is intended.
                f.write("# %s\n# %s\n\ncoverage %d\n\n" %
                        (label["description"], label["values"],
                         recommendation.size))
                f.write("# best results (recommendation size; metric)\n")
                f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\n"
                        "f05 (%d; %.2f)\n\n" %
                        (best_precision[0], best_precision[1],
                         best_f1[0], best_f1[1],
                         best_f05[0], best_f05[1]))
                f.write("# recommendation size 10\nprecision (10; %.2f)\n"
                        "f1 (10; %.2f)\nf05 (10; %.2f)" %
                        (precision_10, f1_10, f05_10))
            precision = results.get_precision_summary()
            recall = results.get_recall_summary()
            f1 = results.get_f1_summary()
            f05 = results.get_f05_summary()
            accuracy = results.get_accuracy_summary()
            plot_summary(precision, recall, f1, f05, accuracy, log_file)
223 -  
def run_content(user, cfg):
    """Run every content-based strategy with every swept profile size."""
    for strategy_name in content_based:
        cfg.strategy = strategy_name
        for n_terms in profile_size:
            cfg.profile_size = n_terms
            run_strategy(cfg, user)
230 -  
def run_collaborative(user, cfg):
    """Run every collaborative strategy with every swept neighbourhood size.

    NOTE: a further sweep over popcon index sizes (appending "_<size>" to
    cfg.popcon_desktopapps and cfg.popcon_programs) used to live here but was
    already commented out; the locals that only served it are removed too.
    """
    for strategy in collaborative:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            run_strategy(cfg, user)
243 -  
def run_hybrid(user, cfg):
    """Run every hybrid strategy over all neighbourhood and profile sizes.

    NOTE: a further sweep over popcon index sizes (appending "_<size>" to
    cfg.popcon_desktopapps and cfg.popcon_programs) used to live here but was
    already commented out; the locals that only served it are removed too.
    """
    for strategy in hybrid:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            for size in profile_size:
                cfg.profile_size = size
                run_strategy(cfg, user)
258 -  
if __name__ == '__main__':
    # Entry point: strategies-suite [content|collaborative|hybrid]
    # Alternative user sources tried earlier:
    #   user = LocalSystem()
    #   user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))

    cfg = Config()
    # The experiment runs on one fixed popcon submission.
    submission_path = "/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7"
    # Another submission used before:
    #   "/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623"
    user = PopconSystem(submission_path)
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    # With no argument at all, every strategy family is run.
    arguments = sys.argv
    run_all = len(arguments) < 2
    families = (
        ("content", run_content),
        ("collaborative", run_collaborative),
        ("hybrid", run_hybrid),
    )
    for keyword, runner in families:
        if keyword in arguments or run_all:
            runner(user, cfg)