Commit ccd4ef5568d421a430f37975baff0bacf775b91c
1 parent
e2be2c33
Exists in
master
and in
1 other branch
Renamed files.
Showing
3 changed files
with
0 additions
and
691 deletions
Show diff stats
src/experiments/k-suite.py
... | ... | @@ -1,186 +0,0 @@ |
1 | -#!/usr/bin/env python | |
2 | -""" | |
3 | - k-suite - experiment different neighborhood sizes | |
4 | -""" | |
5 | -__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | |
6 | -__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | |
7 | -__license__ = """ | |
8 | - This program is free software: you can redistribute it and/or modify | |
9 | - it under the terms of the GNU General Public License as published by | |
10 | - the Free Software Foundation, either version 3 of the License, or | |
11 | - (at your option) any later version. | |
12 | - | |
13 | - This program is distributed in the hope that it will be useful, | |
14 | - but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | - GNU General Public License for more details. | |
17 | - | |
18 | - You should have received a copy of the GNU General Public License | |
19 | - along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | -""" | |
21 | - | |
22 | -import sys | |
23 | -sys.path.insert(0,'../') | |
24 | -from config import Config | |
25 | -from data import PopconXapianIndex, PopconSubmission | |
26 | -from recommender import Recommender | |
27 | -from user import LocalSystem, User | |
28 | -from evaluation import * | |
29 | -import logging | |
30 | -import random | |
31 | -import Gnuplot | |
32 | -import numpy | |
33 | - | |
def plot_roc(k,roc_points,log_file):
    """
    Plot the ROC points collected for neighborhood size k.

    The chart shows the measured points plus the [0,0]-[1,1] diagonal and
    is saved next to log_file as "<log_file>-kNNN.png" and
    "<log_file>-kNNN.ps".
    """
    setup_name = log_file.split("/")[-1]
    chart = Gnuplot.Gnuplot()
    chart('set style data points')
    chart.xlabel('False Positive Rate')
    chart.ylabel('True Positive Rate')
    # Both axes are rates, so clamp them to the unit interval.
    for axis_range in ('set xrange [0:1.0]', 'set yrange [0:1.0]'):
        chart(axis_range)
    chart.title("Setup: %s-k%d" % (setup_name,k))
    diagonal = Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7")
    measured = Gnuplot.Data(roc_points)
    chart.plot(diagonal, measured)
    png_path = log_file+("-k%.3d.png"%k)
    ps_path = log_file+("-k%.3d.ps"%k)
    chart.hardcopy(png_path,terminal="png")
    chart.hardcopy(ps_path,terminal="postscript",enhanced=1,color=1)
46 | - | |
def plot_summary(precision,f05,mcc,log_file):
    """
    Plot mean precision, F0.5 and MCC against the neighborhood size k.

    precision, f05 and mcc are dicts mapping k to a list of per-user
    values. The chart is saved as "<log_file>.png" and "<log_file>.ps".
    """
    def mean_by_k(metric):
        # The chart uses the "lines" style, so points must be emitted in
        # increasing k; plain dict iteration order would make the line
        # jump back and forth. Neighborhood sizes without any value are
        # skipped instead of raising ZeroDivisionError.
        return [[k, float(sum(values))/len(values)]
                for k, values in sorted(metric.items()) if values]

    g = Gnuplot.Gnuplot()
    g('set style data lines')
    g.xlabel('Neighborhood (k)')
    g.title("Setup: %s-size20" % (log_file.split("/")[-1]))
    g.plot(Gnuplot.Data(mean_by_k(precision),title="P"),
           Gnuplot.Data(mean_by_k(f05),title="F05"),
           Gnuplot.Data(mean_by_k(mcc),title="MCC"))
    g.hardcopy(log_file+(".png"),terminal="png")
    g.hardcopy(log_file+(".ps"),terminal="postscript",enhanced=1,color=1)
57 | - | |
def _average(values):
    """Return the arithmetic mean of values, or 0 when there are none."""
    if not values:
        return 0
    return sum(values)/len(values)


class ExperimentResults:
    """
    Accumulate evaluation metrics over repeated recommendation rounds.

    Each call to add_result() appends one value per metric (precision,
    recall, false positive rate, F0.5 and MCC); the get_* methods return
    the mean of what has been accumulated so far.
    """
    def __init__(self,repo_size):
        self.repository_size = repo_size
        self.precision = []
        self.recall = []
        self.fpr = []
        self.f05 = []
        self.mcc = []

    def add_result(self,ranking,sample):
        """
        Evaluate one recommendation against the held-out sample.

        ranking is the list of recommended items; sample is the dict of
        items that were removed from the profile before recommending.
        """
        predicted = RecommendationResult(dict.fromkeys(ranking,1))
        real = RecommendationResult(sample)
        evaluation = Evaluation(predicted,real,self.repository_size)
        self.precision.append(evaluation.run(Precision()))
        self.recall.append(evaluation.run(Recall()))
        self.fpr.append(evaluation.run(FPR()))
        self.f05.append(evaluation.run(F_score(0.5)))
        self.mcc.append(evaluation.run(MCC()))

    def get_roc_point(self):
        """Return [mean fpr, mean recall], or [0,0] if either list is empty."""
        if not self.recall or not self.fpr:
            return [0,0]
        return [_average(self.fpr),_average(self.recall)]

    def get_precision_summary(self):
        """Return the mean precision, or 0 when nothing was recorded."""
        return _average(self.precision)

    def get_f05_summary(self):
        """Return the mean F0.5 score, or 0 when nothing was recorded."""
        return _average(self.f05)

    def get_mcc_summary(self):
        """Return the mean MCC, or 0 when nothing was recorded."""
        return _average(self.mcc)
95 | - | |
if __name__ == '__main__':
    # This script calls os.path.* and os.makedirs but has no explicit
    # "import os" at the top; import it here instead of depending on a
    # star import to provide it.
    import os

    if len(sys.argv)<3:
        print("Usage: k-suite strategy_str sample_file")
        sys.exit(1)
    threshold = 20
    iterations = 30
    neighbors = [3,5,10,50,100,150,200,300,400,500]
    cfg = Config()
    cfg.strategy = sys.argv[1]
    sample_file = sys.argv[2]
    sample_name = sample_file.split('/')[-1]

    # One submission id per line; the file for an id lives under
    # <popcon_dir>/<first two characters of id>/<id>.
    population_sample = []
    with open(sample_file,'r') as f:
        for line in f:
            user_id = line.strip('\n')
            if not user_id:
                # A blank line would otherwise become a bogus path.
                continue
            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
    if not population_sample:
        # Nothing to evaluate: every average below would divide by zero.
        print("No submission ids found in %s" % sample_file)
        sys.exit(1)

    # setup dictionaries and files
    roc_summary = {}
    recommended = {}
    precision_summary = {}
    f05_summary = {}
    mcc_summary = {}
    sample_dir = ("results/k-suite/%s" % sample_name)
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)
    log_file = os.path.join(sample_dir,cfg.strategy)
    with open(log_file,'w') as f:
        f.write("# %s\n\n" % sample_name)
        f.write("# strategy %s recommendation_size %d iterations %d\n\n" %
                (cfg.strategy,threshold,iterations))
        f.write("# k coverage \tprecision \tf05 \tmcc\n\n")

    for k in neighbors:
        roc_summary[k] = []
        recommended[k] = set()
        precision_summary[k] = []
        f05_summary[k] = []
        mcc_summary[k] = []
        with open(log_file+"-k%.3d"%k,'w') as f:
            f.write("# %s\n\n" % sample_name)
            f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))
            f.write("# roc_point \tprecision \tf05 \tmcc\n\n")

    # main loop per user
    for submission_file in population_sample:
        user = PopconSystem(submission_file)
        user.filter_pkg_profile(cfg.pkgs_filter)
        user.maximal_pkg_profile()
        profile_len = len(user.pkg_profile)
        # 90% of the profile is held out as the evaluation sample.
        sample_size = int(profile_len*0.9)
        for k in neighbors:
            cfg.k_neighbors = k
            rec = Recommender(cfg)
            repo_size = rec.items_repository.get_doccount()
            results = ExperimentResults(repo_size)
            # n iterations for same recommender and user
            for n in range(iterations):
                # Fill sample profile
                item_score = {}
                for pkg in user.pkg_profile:
                    item_score[pkg] = user.item_score[pkg]
                sample = {}
                for i in range(sample_size):
                    key = random.choice(list(item_score))
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,threshold)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
                    recommended[k] = recommended[k].union(recommendation.ranking)
            # save summary
            roc_point = results.get_roc_point()
            roc_summary[k].append(roc_point)
            precision = results.get_precision_summary()
            precision_summary[k].append(precision)
            f05 = results.get_f05_summary()
            f05_summary[k].append(f05)
            mcc = results.get_mcc_summary()
            mcc_summary[k].append(mcc)
            with open(log_file+"-k%.3d"%k,'a') as f:
                f.write("[%.2f,%.2f] \t%.4f \t%.4f \t%.4f\n" %
                        (roc_point[0],roc_point[1],precision,f05,mcc))

    # back to main flow
    with open(log_file,'a') as f:
        for k in neighbors:
            # Coverage is indexed by the neighborhood size k (the previous
            # index name "size" was never defined in this script).
            coverage = len(recommended[k])/float(repo_size)
            f.write("%3d \t%.2f \t%.4f \t%.4f \t%.4f\n" %
                    (k,coverage,float(sum(precision_summary[k]))/len(precision_summary[k]),
                     float(sum(f05_summary[k]))/len(f05_summary[k]),
                     float(sum(mcc_summary[k]))/len(mcc_summary[k])))
    # Plot only after the summary log is complete and closed.
    plot_summary(precision_summary,f05_summary,mcc_summary,log_file)
    for k in neighbors:
        plot_roc(k,roc_summary[k],log_file)
src/experiments/roc-suite.py
... | ... | @@ -1,231 +0,0 @@ |
1 | -#!/usr/bin/env python | |
2 | -""" | |
3 | - recommender suite - recommender experiments suite | |
4 | -""" | |
5 | -__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | |
6 | -__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | |
7 | -__license__ = """ | |
8 | - This program is free software: you can redistribute it and/or modify | |
9 | - it under the terms of the GNU General Public License as published by | |
10 | - the Free Software Foundation, either version 3 of the License, or | |
11 | - (at your option) any later version. | |
12 | - | |
13 | - This program is distributed in the hope that it will be useful, | |
14 | - but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | - GNU General Public License for more details. | |
17 | - | |
18 | - You should have received a copy of the GNU General Public License | |
19 | - along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | -""" | |
21 | - | |
22 | -import sys | |
23 | -sys.path.insert(0,'../') | |
24 | -from config import Config | |
25 | -from data import PopconXapianIndex, PopconSubmission | |
26 | -from recommender import Recommender | |
27 | -from user import LocalSystem, User | |
28 | -from evaluation import * | |
29 | -import logging | |
30 | -import random | |
31 | -import Gnuplot | |
32 | -import numpy | |
33 | - | |
34 | -#iterations = 3 | |
35 | -#sample_proportions = [0.9] | |
36 | -#weighting = [('bm25',1.2)] | |
37 | -#collaborative = ['knn_eset'] | |
38 | -#content_based = ['cb'] | |
39 | -#hybrid = ['knnco'] | |
40 | -#profile_size = [50,100] | |
41 | -#popcon_size = ["1000"] | |
42 | -#neighbors = [50] | |
43 | - | |
# Experiment grid used by the run_* functions below.
iterations = 30                 # repetitions per configuration
sample_proportions = [0.9]      # fraction of the profile held out
weighting = [('bm25', 1.0)]     # (cfg.weight, cfg.bm25_k1) pairs
content_based = [
    'cb', 'cbt', 'cbd', 'cbh',
    'cb_eset', 'cbt_eset', 'cbd_eset', 'cbh_eset',
]
collaborative = ['knn_eset', 'knn', 'knn_plus']
hybrid = ['knnco', 'knnco_eset']
profile_size = range(20, 200, 40)
neighbors = range(10, 510, 50)
52 | - | |
def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
    """
    Write the recall log of iteration n to "<log_file>-NN".

    The file starts with the label description/values and the
    repository, profile and sample sizes. When recommendation has a
    "ranking" attribute, the zero-based rank of every sampled package
    found in the ranking follows in ascending order, and then the
    sampled packages that are missing from the ranking.
    """
    # "with" closes the file even if one of the writes fails.
    with open(("%s-%.2d" % (log_file,n)),'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%.2d\n" % (label["values"],n))
        output.write("\n# repository profile sample\n%d %d %d\n" % \
                     (repo_size,profile_size,len(sample)))
        if hasattr(recommendation,"ranking"):
            # Map each package to its first position once, instead of an
            # "in" test plus an index() scan of the ranking per package.
            first_rank = {}
            for position, pkg in enumerate(recommendation.ranking):
                first_rank.setdefault(pkg,position)
            notfound = []
            ranks = []
            for pkg in sample.keys():
                if pkg in first_rank:
                    ranks.append(first_rank[pkg])
                else:
                    notfound.append(pkg)
            for r in sorted(ranks):
                output.write(str(r)+"\n")
            if notfound:
                output.write("# out of recommendation:\n")
                for pkg in notfound:
                    output.write(pkg+"\n")
75 | - | |
def plot_roc(roc_points,eauc,c,p,log_file):
    """
    Plot a ROC curve annotated with coverage (c), P(20) (p) and AUC (eauc).

    The chart also shows the [0,0]-[1,1] diagonal and is saved as
    "<log_file>-roc.png" and "<log_file>-roc.ps".
    """
    chart = Gnuplot.Gnuplot()
    chart('set style data lines')
    chart.xlabel('False Positive Rate')
    chart.ylabel('True Positive Rate')
    # Both axes are rates, so clamp them to the unit interval.
    for axis_range in ('set xrange [0:1.0]', 'set yrange [0:1.0]'):
        chart(axis_range)
    chart.title("Setup: %s" % log_file.split("/")[-1])
    # Summary figures are stacked as text labels on the right side.
    annotations = (
        ('set label "C %.2f" at 0.8,0.25', c),
        ('set label "P(20) %.2f" at 0.8,0.2', p),
        ('set label "AUC %.4f" at 0.8,0.15', eauc),
    )
    for command, value in annotations:
        chart(command % value)
    curve = Gnuplot.Data(roc_points,title="ROC")
    diagonal = Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7")
    chart.plot(curve, diagonal)
    chart.hardcopy(log_file+"-roc.png",terminal="png")
    chart.hardcopy(log_file+"-roc.ps",terminal="postscript",enhanced=1,color=1)
92 | - | |
def get_label(cfg,sample_proportion):
    """
    Build the description/values label for the strategy set in cfg.

    Returns a dict with keys "description" and "values". For a strategy
    that is in none of the content_based, collaborative or hybrid lists,
    "Unknown strategy" is printed and an empty dict is returned.
    sample_proportion is accepted but not used here.
    """
    strategy = cfg.strategy
    if strategy in content_based:
        return {
            "description": "strategy-profile",
            "values": "%s-profile%.3d" % (strategy,cfg.profile_size),
        }
    if strategy in collaborative:
        return {
            "description": "strategy-knn",
            "values": "%s-k%.3d" % (strategy,cfg.k_neighbors),
        }
    if strategy in hybrid:
        return {
            "description": "strategy-knn-profile",
            "values": "%s-k%.3d-profile%.3d" %
                      (strategy,cfg.k_neighbors,cfg.profile_size),
        }
    print("Unknown strategy")
    return {}
110 | - | |
class ExperimentResults:
    """
    Collect precision, recall and false positive rate at several
    recommendation sizes (1, then every 10 below the repository size),
    together with the set of all items ever recommended.
    """
    def __init__(self,repo_size):
        self.repository_size = repo_size
        self.recommended = set()
        cutoffs = [1]+list(range(10,self.repository_size,10))
        self.precision = dict((size,[]) for size in cutoffs)
        self.recall = dict((size,[]) for size in cutoffs)
        self.fpr = dict((size,[]) for size in cutoffs)

    def add_result(self,ranking,sample):
        """Evaluate every prefix ranking[:size] against the held-out sample."""
        self.recommended = self.recommended | set(ranking)
        for size in self.precision:
            top_items = dict.fromkeys(ranking[:size],1)
            predicted = RecommendationResult(top_items)
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repository_size)
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            self.fpr[size].append(evaluation.run(FPR()))

    def get_roc_points(self):
        """
        Return the sorted list of [mean fpr, mean recall] pairs, one per
        recommendation size.
        """
        points = []
        for size in self.recall:
            recall_values = self.recall[size]
            fpr_values = self.fpr[size]
            mean_fpr = sum(fpr_values)/len(fpr_values)
            mean_tpr = sum(recall_values)/len(recall_values)
            points.append([mean_fpr,mean_tpr])
        points.sort()
        return points
143 | - | |
def run_strategy(cfg,user):
    """
    Run the ROC experiment for the strategy currently configured in cfg.

    For every weighting and sample proportion, part of the user profile
    is held out `iterations` times, a full-length recommendation is
    requested for the remainder and evaluated against the held-out part.
    Per-iteration recall logs, a summary log and a ROC plot are written
    under results/roc-suite/<first 8 characters of the user id>/.
    """
    for weight in weighting:
        cfg.weight = weight[0]
        cfg.bm25_k1 = weight[1]
        rec = Recommender(cfg)
        repo_size = rec.items_repository.get_doccount()
        for proportion in sample_proportions:
            results = ExperimentResults(repo_size)
            label = get_label(cfg,proportion)
            user_dir = ("results/roc-suite/%s" % user.user_id[:8])
            if not os.path.exists(user_dir):
                # makedirs also creates results/roc-suite when it is
                # missing; a plain mkdir would fail in that case.
                os.makedirs(user_dir)
            log_file = os.path.join(user_dir,label["values"])
            profile_len = len(user.pkg_profile)
            sample_size = int(profile_len*proportion)
            evaluated = 0
            for n in range(iterations):
                # Fill sample profile
                item_score = {}
                for pkg in user.pkg_profile:
                    item_score[pkg] = user.item_score[pkg]
                sample = {}
                for i in range(sample_size):
                    key = random.choice(list(item_score))
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,repo_size)
                write_recall_log(label,n,sample,recommendation,profile_len,repo_size,log_file)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
                    evaluated += 1
            if not evaluated:
                # Without any ranking every average below would divide
                # by zero, so skip the summary for this configuration.
                print("No ranking produced for %s; summary skipped" % label["values"])
                continue
            roc_points = results.get_roc_points()
            x_coord = [p[0] for p in roc_points]
            y_coord = [p[1] for p in roc_points]
            auc = numpy.trapz(y=y_coord, x=x_coord)
            # Extend the curve to the (0,0) and (1,1) corners.
            eauc = (auc+
                    numpy.trapz(y=[0,roc_points[0][1]],x=[0,roc_points[0][0]])+
                    numpy.trapz(y=[roc_points[-1][1],1],x=[roc_points[-1][0],1]))
            # Reported as p(20) in the log and P(20) on the plot, so read
            # the size-20 results (the size-10 list was used before).
            precision_20 = sum(results.precision[20])/len(results.precision[20])
            coverage = len(results.recommended)/float(repo_size)
            with open(log_file,'w') as f:
                f.write("# %s\n# %s\n\n" %
                        (label["description"],label["values"]))
                f.write("# coverage \tp(20) \tauc \teauc\n\t%.2f \t%.2f \t%.4f \t%.4f\n\n" %
                        (coverage,precision_20,auc,eauc))
            plot_roc(roc_points,eauc,coverage,precision_20,log_file)
188 | - | |
def run_content(user,cfg):
    """Run every content-based strategy with every profile size."""
    for strategy_name in content_based:
        cfg.strategy = strategy_name
        for n_terms in profile_size:
            cfg.profile_size = n_terms
            run_strategy(cfg,user)
195 | - | |
def run_collaborative(user,cfg):
    """Run every collaborative strategy with every neighborhood size."""
    # The popcon_desktopapps / popcon_programs locals that used to be
    # read here were never used and have been dropped.
    for strategy in collaborative:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            run_strategy(cfg,user)
204 | - | |
def run_hybrid(user,cfg):
    """
    Run every hybrid strategy with every combination of neighborhood
    size and profile size.
    """
    # The popcon_desktopapps / popcon_programs locals that used to be
    # read here were never used and have been dropped.
    for strategy in hybrid:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            for size in profile_size:
                cfg.profile_size = size
                run_strategy(cfg,user)
215 | - | |
if __name__ == '__main__':
    # argv[1] is the popcon submission to evaluate; any further argument
    # selects which strategy families to run (all of them when omitted).
    if not sys.argv[1:]:
        print("Usage: roc-suite popcon_submission_path [content|collaborative|hybrid]")
        exit(1)

    cfg = Config()
    user = PopconSystem(sys.argv[1])
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    run_everything = len(sys.argv)<3
    families = (
        ("content",run_content),
        ("collaborative",run_collaborative),
        ("hybrid",run_hybrid),
    )
    for family, runner in families:
        if run_everything or family in sys.argv:
            runner(user,cfg)
src/experiments/strategies-suite.py
... | ... | @@ -1,274 +0,0 @@ |
1 | -#!/usr/bin/env python | |
2 | -""" | |
3 | - recommender suite - recommender experiments suite | |
4 | -""" | |
5 | -__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | |
6 | -__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | |
7 | -__license__ = """ | |
8 | - This program is free software: you can redistribute it and/or modify | |
9 | - it under the terms of the GNU General Public License as published by | |
10 | - the Free Software Foundation, either version 3 of the License, or | |
11 | - (at your option) any later version. | |
12 | - | |
13 | - This program is distributed in the hope that it will be useful, | |
14 | - but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | - GNU General Public License for more details. | |
17 | - | |
18 | - You should have received a copy of the GNU General Public License | |
19 | - along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | -""" | |
21 | - | |
22 | -import sys | |
23 | -sys.path.insert(0,'../') | |
24 | -from config import Config | |
25 | -from data import PopconXapianIndex, PopconSubmission, AppAptXapianIndex | |
26 | -from recommender import Recommender | |
27 | -from user import LocalSystem, User | |
28 | -from evaluation import * | |
29 | -import logging | |
30 | -import random | |
31 | -import Gnuplot | |
32 | - | |
33 | -#iterations = 3 | |
34 | -#sample_proportions = [0.9] | |
35 | -#weighting = [('bm25',1.2)] | |
36 | -#collaborative = ['knn'] | |
37 | -#content_based = [] | |
38 | -#hybrid = ['knnco'] | |
39 | -#profile_size = [50,100] | |
40 | -#popcon_size = ["1000"] | |
41 | -#neighbors = [50] | |
42 | - | |
# Experiment grid used by the run_* functions below.
iterations = 10                 # repetitions per configuration
sample_proportions = [0.5, 0.6, 0.7, 0.8, 0.9]   # fraction held out
weighting = [                   # (cfg.weight, cfg.bm25_k1) pairs
    ('bm25', 1.2),
    ('bm25', 1.6),
    ('bm25', 2.0),
    ('trad', 0),
]
content_based = [
    'cb', 'cbt', 'cbd', 'cbh',
    'cb_eset', 'cbt_eset', 'cbd_eset', 'cbh_eset',
]
collaborative = ['knn_eset', 'knn', 'knn_plus']
hybrid = ['knnco', 'knnco_eset']

profile_size = range(20, 100, 20)
#popcon_size = [1000,10000,50000,'full']
neighbors = range(10, 510, 50)
53 | - | |
def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
    """
    Write the recall log of iteration n to "<log_file>-<n>".

    The file starts with the label description/values and the
    repository, profile and sample sizes. When recommendation has a
    "ranking" attribute, the zero-based rank of every sampled package
    found in the ranking follows in ascending order, and then the
    sampled packages that are missing from the ranking.
    """
    # "with" closes the file even if one of the writes fails.
    with open(("%s-%d" % (log_file,n)),'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%d\n" % (label["values"],n))
        output.write("\n%d %d %d\n" % \
                     (repo_size,profile_size,len(sample)))
        if hasattr(recommendation,"ranking"):
            # Map each package to its first position once, instead of an
            # "in" test plus an index() scan of the ranking per package.
            first_rank = {}
            for position, pkg in enumerate(recommendation.ranking):
                first_rank.setdefault(pkg,position)
            notfound = []
            ranks = []
            for pkg in sample.keys():
                if pkg in first_rank:
                    ranks.append(first_rank[pkg])
                else:
                    notfound.append(pkg)
            for r in sorted(ranks):
                output.write(str(r)+"\n")
            if notfound:
                output.write("Out of recommendation:\n")
                for pkg in notfound:
                    output.write(pkg+"\n")
76 | - | |
def plot_summary(precision,recall,f1,f05,accuracy,log_file):
    """
    Plot the metric summaries against the recommendation size.

    The chart is saved as "<log_file>.png"/".ps" and, with a logarithmic
    x axis, as "<log_file>-logscale.png"/".ps".
    """
    chart = Gnuplot.Gnuplot()
    chart('set style data lines')
    chart.xlabel('Recommendation size')
    chart.title("Setup: %s" % log_file.split("/")[-1])
    curves = [
        Gnuplot.Data(accuracy,title="Accuracy"),
        Gnuplot.Data(precision,title="Precision"),
        Gnuplot.Data(recall,title="Recall"),
        Gnuplot.Data(f1,title="F_1"),
        Gnuplot.Data(f05,title="F_0.5"),
    ]
    chart.plot(*curves)
    # Linear-scale copies first.
    chart.hardcopy(log_file+".png",terminal="png")
    chart.hardcopy(log_file+".ps",terminal="postscript",enhanced=1,color=1)
    # Then redraw the same plot with a logarithmic x axis.
    for command in ('set logscale x', 'replot'):
        chart(command)
    chart.hardcopy(log_file+"-logscale.png",terminal="png")
    chart.hardcopy(log_file+"-logscale.ps",terminal="postscript",enhanced=1,color=1)
94 | - | |
def get_label(cfg,sample_proportion):
    """
    Build the description/values label for the strategy set in cfg.

    Returns a dict with keys "description" and "values"; the values
    string encodes the strategy, its parameters, the last path component
    of cfg.pkgs_filter, the BM25 k1 and the sample proportion. For a
    strategy in none of the content_based, collaborative or hybrid
    lists, "Unknown strategy" is printed and an empty dict is returned.
    """
    strategy = cfg.strategy
    if strategy in content_based:
        return {
            "description": "strategy-filter-profile-k1_bm25-sample",
            "values": "%s-profile%d-%s-kbm%.1f-sample%.1f" %
                      (strategy,cfg.profile_size,
                       cfg.pkgs_filter.split("/")[-1],
                       cfg.bm25_k1,sample_proportion),
        }
    if strategy in collaborative:
        return {
            "description": "strategy-knn-filter-k1_bm25-sample",
            "values": "%s-k%d-%s-kbm%.1f-sample%.1f" %
                      (strategy,cfg.k_neighbors,
                       cfg.pkgs_filter.split("/")[-1],
                       cfg.bm25_k1,sample_proportion),
        }
    if strategy in hybrid:
        return {
            "description": "strategy-knn-filter-profile-k1_bm25-sample",
            "values": "%s-k%d-profile%d-%s-kbm%.1f-sample%.1f" %
                      (strategy,cfg.k_neighbors,cfg.profile_size,
                       cfg.pkgs_filter.split("/")[-1],
                       cfg.bm25_k1,sample_proportion),
        }
    print("Unknown strategy")
    return {}
118 | - | |
def _mean_by_size(metric):
    """Return sorted [size, mean of values] pairs for a size -> values dict."""
    return sorted([size,sum(values)/len(values)]
                  for size,values in metric.items())


def _best_of(metric):
    """Return (size, value) for the largest single value in a size -> values dict."""
    size = max(metric, key = lambda x: max(metric[x]))
    return (size,max(metric[size]))


class ExperimentResults:
    """
    Collect accuracy, precision, recall, F1 and F0.5 at several
    recommendation sizes: 1, every 10 below 200, then every 100 below
    the repository size.

    The summary and best_* methods need at least one add_result() call;
    with no recorded values they raise ZeroDivisionError / ValueError.
    """
    def __init__(self,repo_size):
        self.repository_size = repo_size
        self.accuracy = {}
        self.precision = {}
        self.recall = {}
        self.f1 = {}
        self.f05 = {}
        points = ([1]+list(range(10,200,10))+
                  list(range(200,self.repository_size,100)))
        for size in points:
            self.accuracy[size] = []
            self.precision[size] = []
            self.recall[size] = []
            self.f1[size] = []
            self.f05[size] = []

    def add_result(self,ranking,sample):
        """Evaluate every prefix ranking[:size] against the held-out sample."""
        for size in self.accuracy.keys():
            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repository_size)
            self.accuracy[size].append(evaluation.run(Accuracy()))
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            self.f1[size].append(evaluation.run(F_score(1)))
            self.f05[size].append(evaluation.run(F_score(0.5)))

    def get_precision_summary(self):
        """Return sorted [size, mean precision] pairs."""
        return _mean_by_size(self.precision)

    def get_recall_summary(self):
        """Return sorted [size, mean recall] pairs."""
        return _mean_by_size(self.recall)

    def get_f1_summary(self):
        """Return sorted [size, mean F1] pairs."""
        return _mean_by_size(self.f1)

    def get_f05_summary(self):
        """Return sorted [size, mean F0.5] pairs."""
        return _mean_by_size(self.f05)

    def get_accuracy_summary(self):
        """Return sorted [size, mean accuracy] pairs."""
        return _mean_by_size(self.accuracy)

    def best_precision(self):
        """Return (size, value) of the highest single precision recorded."""
        return _best_of(self.precision)

    def best_f1(self):
        """Return (size, value) of the highest single F1 recorded."""
        return _best_of(self.f1)

    def best_f05(self):
        """Return (size, value) of the highest single F0.5 recorded."""
        return _best_of(self.f05)
177 | - | |
def run_strategy(cfg,user):
    """
    Run the metrics experiment for the strategy currently configured in cfg.

    For every weighting and sample proportion, part of the user profile
    is held out `iterations` times, a full-length recommendation is
    requested for the remainder and evaluated against the held-out part.
    Per-iteration recall logs, a summary log and summary plots are
    written under results/strategies/.
    """
    # Local import: this script has no explicit top-level "import os".
    import os

    log_dir = "results/strategies"
    for weight in weighting:
        cfg.weight = weight[0]
        cfg.bm25_k1 = weight[1]
        rec = Recommender(cfg)
        repo_size = rec.items_repository.get_doccount()
        for proportion in sample_proportions:
            results = ExperimentResults(repo_size)
            label = get_label(cfg,proportion)
            if not os.path.exists(log_dir):
                # The log files are opened for writing below, which
                # fails when the directory does not exist yet.
                os.makedirs(log_dir)
            log_file = "results/strategies/"+label["values"]
            # Named profile_len so it does not shadow the module-level
            # profile_size sweep list.
            profile_len = len(user.pkg_profile)
            sample_size = int(profile_len*proportion)
            last_ranked = None
            for n in range(iterations):
                # Fill sample profile
                item_score = {}
                for pkg in user.pkg_profile:
                    item_score[pkg] = user.item_score[pkg]
                sample = {}
                for i in range(sample_size):
                    key = random.choice(list(item_score))
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,repo_size)
                write_recall_log(label,n,sample,recommendation,profile_len,repo_size,log_file)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
                    last_ranked = recommendation
            if last_ranked is None:
                # Without any ranking every average below would divide
                # by zero, so skip the summary for this configuration.
                print("No ranking produced for %s; summary skipped" % label["values"])
                continue
            precision_10 = sum(results.precision[10])/len(results.precision[10])
            f1_10 = sum(results.f1[10])/len(results.f1[10])
            f05_10 = sum(results.f05[10])/len(results.f05[10])
            with open(log_file,'w') as f:
                # "coverage" is the size of the last recommendation that
                # carried a ranking.
                f.write("# %s\n# %s\n\ncoverage %d\n\n" %
                        (label["description"],label["values"],last_ranked.size))
                f.write("# best results (recommendation size; metric)\n")
                f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\nf05 (%d; %.2f)\n\n" %
                        (results.best_precision()[0],results.best_precision()[1],
                         results.best_f1()[0],results.best_f1()[1],
                         results.best_f05()[0],results.best_f05()[1]))
                f.write("# recommendation size 10\nprecision (10; %.2f)\nf1 (10; %.2f)\nf05 (10; %.2f)" %
                        (precision_10,f1_10,f05_10))
            precision = results.get_precision_summary()
            recall = results.get_recall_summary()
            f1 = results.get_f1_summary()
            f05 = results.get_f05_summary()
            accuracy = results.get_accuracy_summary()
            plot_summary(precision,recall,f1,f05,accuracy,log_file)
223 | - | |
def run_content(user,cfg):
    """Run every content-based strategy with every profile size."""
    for strategy_name in content_based:
        cfg.strategy = strategy_name
        for n_terms in profile_size:
            cfg.profile_size = n_terms
            run_strategy(cfg,user)
230 | - | |
def run_collaborative(user,cfg):
    """Run every collaborative strategy with every neighborhood size."""
    # Base index paths; only the disabled popcon-size sweep below
    # refers to them.
    popcon_desktopapps = cfg.popcon_desktopapps
    popcon_programs = cfg.popcon_programs
    for strategy_name in collaborative:
        cfg.strategy = strategy_name
        for n_neighbors in neighbors:
            cfg.k_neighbors = n_neighbors
            #for size in popcon_size:
            #    if size:
            #        cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
            #        cfg.popcon_programs = popcon_programs+"_"+size
            run_strategy(cfg,user)
243 | - | |
def run_hybrid(user,cfg):
    """
    Run every hybrid strategy with every combination of neighborhood
    size and profile size.
    """
    # Base index paths; only the disabled popcon-size sweep below
    # refers to them.
    popcon_desktopapps = cfg.popcon_desktopapps
    popcon_programs = cfg.popcon_programs
    for strategy_name in hybrid:
        cfg.strategy = strategy_name
        for n_neighbors in neighbors:
            cfg.k_neighbors = n_neighbors
            #for size in popcon_size:
            #    if size:
            #        cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
            #        cfg.popcon_programs = popcon_programs+"_"+size
            for n_terms in profile_size:
                cfg.profile_size = n_terms
                run_strategy(cfg,user)
258 | - | |
if __name__ == '__main__':
    # The evaluated submission is hard-coded; command-line arguments only
    # select which strategy families to run (all of them when omitted).
    #user = LocalSystem()
    #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))

    cfg = Config()
    user = PopconSystem("/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7")
    #user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    run_everything = len(sys.argv)<2
    families = (
        ("content",run_content),
        ("collaborative",run_collaborative),
        ("hybrid",run_hybrid),
    )
    for family, runner in families:
        if run_everything or family in sys.argv:
            runner(user,cfg)