Commit ccd4ef5568d421a430f37975baff0bacf775b91c
1 parent
e2be2c33
Exists in
master
and in
1 other branch
Renamed files.
Showing
3 changed files
with
0 additions
and
691 deletions
Show diff stats
src/experiments/k-suite.py
@@ -1,186 +0,0 @@ | @@ -1,186 +0,0 @@ | ||
1 | -#!/usr/bin/env python | ||
2 | -""" | ||
3 | - k-suite - experiment different neighborhood sizes | ||
4 | -""" | ||
5 | -__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | ||
6 | -__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | ||
7 | -__license__ = """ | ||
8 | - This program is free software: you can redistribute it and/or modify | ||
9 | - it under the terms of the GNU General Public License as published by | ||
10 | - the Free Software Foundation, either version 3 of the License, or | ||
11 | - (at your option) any later version. | ||
12 | - | ||
13 | - This program is distributed in the hope that it will be useful, | ||
14 | - but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | - GNU General Public License for more details. | ||
17 | - | ||
18 | - You should have received a copy of the GNU General Public License | ||
19 | - along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 | -""" | ||
21 | - | ||
22 | -import sys | ||
23 | -sys.path.insert(0,'../') | ||
24 | -from config import Config | ||
25 | -from data import PopconXapianIndex, PopconSubmission | ||
26 | -from recommender import Recommender | ||
27 | -from user import LocalSystem, User | ||
28 | -from evaluation import * | ||
29 | -import logging | ||
30 | -import random | ||
31 | -import Gnuplot | ||
32 | -import numpy | ||
33 | - | ||
def plot_roc(k,roc_points,log_file):
    """
    Plot the ROC points gathered for neighborhood size `k`.

    `roc_points` is drawn as points on the [0:1.0] x [0:1.0] square together
    with the diagonal from (0,0) to (1,1). The chart is saved next to
    `log_file` as "<log_file>-kNNN.png" and "<log_file>-kNNN.ps".
    """
    setup_name = log_file.split("/")[-1]
    output_base = log_file+("-k%.3d" % k)

    chart = Gnuplot.Gnuplot()
    chart('set style data points')
    chart.xlabel('False Positive Rate')
    chart.ylabel('True Positive Rate')
    for axis in ('x','y'):
        chart('set %srange [0:1.0]' % axis)
    chart.title("Setup: %s-k%d" % (setup_name,k))

    diagonal = Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7")
    measured = Gnuplot.Data(roc_points)
    chart.plot(diagonal,measured)

    chart.hardcopy(output_base+".png",terminal="png")
    chart.hardcopy(output_base+".ps",terminal="postscript",enhanced=1,color=1)
46 | - | ||
def plot_summary(precision,f05,mcc,log_file):
    """
    Plot the mean precision, F0.5 and MCC against the neighborhood size.

    `precision`, `f05` and `mcc` are dicts mapping a neighborhood size k to
    the list of values collected for that k. One curve per metric is drawn
    and the chart is saved as "<log_file>.png" and "<log_file>.ps".
    """
    def mean_by_k(metric):
        # The chart uses the "lines" style, so the points must be ordered by
        # k; iterating the dict directly yields an arbitrary key order.
        # Sizes without any collected value are skipped instead of dividing
        # by zero.
        return [[k,float(sum(metric[k]))/len(metric[k])]
                for k in sorted(metric) if metric[k]]

    g = Gnuplot.Gnuplot()
    g('set style data lines')
    g.xlabel('Neighborhood (k)')
    g.title("Setup: %s-size20" % (log_file.split("/")[-1]))
    g.plot(Gnuplot.Data(mean_by_k(precision),title="P"),
           Gnuplot.Data(mean_by_k(f05),title="F05"),
           Gnuplot.Data(mean_by_k(mcc),title="MCC"))
    g.hardcopy(log_file+(".png"),terminal="png")
    g.hardcopy(log_file+(".ps"),terminal="postscript",enhanced=1,color=1)
57 | - | ||
class ExperimentResults:
    """
    Accumulates evaluation metrics over repeated recommendation rounds.

    Each call to add_result() appends one value per metric (precision,
    recall, false positive rate, F0.5 and MCC); the get_* methods return the
    mean of what has been collected so far.
    """
    def __init__(self,repo_size):
        self.repository_size = repo_size
        self.precision = []
        self.recall = []
        self.fpr = []
        self.f05 = []
        self.mcc = []

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of `values`, or 0 when it is empty."""
        if not values:
            return 0
        # float() keeps the division exact even if every value is an int.
        return float(sum(values))/len(values)

    def add_result(self,ranking,sample):
        """
        Evaluate `ranking` (recommended items) against `sample` (the items
        held out from the user profile) and record every metric.
        """
        predicted = RecommendationResult(dict.fromkeys(ranking,1))
        real = RecommendationResult(sample)
        evaluation = Evaluation(predicted,real,self.repository_size)
        self.precision.append(evaluation.run(Precision()))
        self.recall.append(evaluation.run(Recall()))
        self.fpr.append(evaluation.run(FPR()))
        self.f05.append(evaluation.run(F_score(0.5)))
        self.mcc.append(evaluation.run(MCC()))

    def get_roc_point(self):
        """Return [mean fpr, mean recall], or [0,0] if nothing was recorded."""
        if not self.recall or not self.fpr:
            return [0,0]
        return [self._mean(self.fpr),self._mean(self.recall)]

    def get_precision_summary(self):
        """Return the mean precision (0 if nothing was recorded)."""
        return self._mean(self.precision)

    def get_f05_summary(self):
        """Return the mean F0.5 score (0 if nothing was recorded)."""
        return self._mean(self.f05)

    def get_mcc_summary(self):
        """Return the mean MCC (0 if nothing was recorded)."""
        return self._mean(self.mcc)
95 | - | ||
if __name__ == '__main__':
    # Runs the k-suite experiment: for every user listed in the sample file
    # and every neighborhood size, repeatedly hide 90% of the user's profile,
    # ask for a recommendation based on the rest and evaluate it against the
    # hidden part. Per-k logs, a summary log and the plots are written under
    # results/k-suite/<sample name>/.

    # Local import: os.path / os.makedirs are used below, but `os` is not
    # among the modules imported at the top of the file.
    import os

    if len(sys.argv)<3:
        print("Usage: k-suite strategy_str sample_file")
        sys.exit(1)
    threshold = 20
    iterations = 30
    neighbors = [3,5,10,50,100,150,200,300,400,500]
    cfg = Config()
    cfg.strategy = sys.argv[1]
    sample_file = sys.argv[2]
    sample_name = sample_file.split('/')[-1]

    # One popcon submission path per user id listed in the sample file.
    population_sample = []
    with open(sample_file,'r') as f:
        for line in f:
            user_id = line.strip('\n')
            if not user_id:
                continue
            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
    if not population_sample:
        # Without users there is no repository size and nothing to average.
        print("No user ids found in %s" % sample_file)
        sys.exit(1)

    # setup dictionaries and files
    roc_summary = {}
    recommended = {}
    precision_summary = {}
    f05_summary = {}
    mcc_summary = {}
    sample_dir = ("results/k-suite/%s" % sample_name)
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)
    log_file = os.path.join(sample_dir,cfg.strategy)
    with open(log_file,'w') as f:
        f.write("# %s\n\n" % sample_name)
        f.write("# strategy %s recommendation_size %d iterations %d\n\n" %
                (cfg.strategy,threshold,iterations))
        f.write("# k coverage \tprecision \tf05 \tmcc\n\n")

    for k in neighbors:
        roc_summary[k] = []
        recommended[k] = set()
        precision_summary[k] = []
        f05_summary[k] = []
        mcc_summary[k] = []
        with open(log_file+"-k%.3d"%k,'w') as f:
            f.write("# %s\n\n" % sample_name)
            f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))
            f.write("# roc_point \tprecision \tf05 \tmcc\n\n")

    # main loop per user
    repo_size = None
    for submission_file in population_sample:
        # NOTE(review): PopconSystem is not among the names this file imports
        # explicitly (only LocalSystem and User come from `user`); confirm it
        # is provided by `from evaluation import *` or add the import.
        user = PopconSystem(submission_file)
        user.filter_pkg_profile(cfg.pkgs_filter)
        user.maximal_pkg_profile()
        for k in neighbors:
            cfg.k_neighbors = k
            rec = Recommender(cfg)
            repo_size = rec.items_repository.get_doccount()
            results = ExperimentResults(repo_size)
            # n iterations for same recommender and user
            for n in range(iterations):
                # Fill sample profile: move a random 90% of the profile into
                # `sample`; what is left in `item_score` is the profile the
                # recommender gets to see.
                profile_len = len(user.pkg_profile)
                item_score = dict((pkg,user.item_score[pkg])
                                  for pkg in user.pkg_profile)
                sample_size = min(int(profile_len*0.9),len(item_score))
                held_out = random.sample(list(item_score),sample_size)
                sample = dict((pkg,item_score.pop(pkg)) for pkg in held_out)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,threshold)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
                    recommended[k] = recommended[k].union(recommendation.ranking)
            # save summary
            roc_point = results.get_roc_point()
            roc_summary[k].append(roc_point)
            precision = results.get_precision_summary()
            precision_summary[k].append(precision)
            f05 = results.get_f05_summary()
            f05_summary[k].append(f05)
            mcc = results.get_mcc_summary()
            mcc_summary[k].append(mcc)
            with open(log_file+"-k%.3d"%k,'a') as f:
                f.write("[%.2f,%.2f] \t%.4f \t%.4f \t%.4f\n" %
                        (roc_point[0],roc_point[1],precision,f05,mcc))

    # back to main flow
    with open(log_file,'a') as f:
        plot_summary(precision_summary,f05_summary,mcc_summary,log_file)
        for k in neighbors:
            # Coverage of neighborhood size k (the original indexed the dict
            # with an undefined name `size`).
            coverage = len(recommended[k])/float(repo_size)
            f.write("%3d \t%.2f \t%.4f \t%.4f \t%.4f\n" %
                    (k,coverage,float(sum(precision_summary[k]))/len(precision_summary[k]),
                     float(sum(f05_summary[k]))/len(f05_summary[k]),
                     float(sum(mcc_summary[k]))/len(mcc_summary[k])))
            plot_roc(k,roc_summary[k],log_file)
src/experiments/roc-suite.py
@@ -1,231 +0,0 @@ | @@ -1,231 +0,0 @@ | ||
1 | -#!/usr/bin/env python | ||
2 | -""" | ||
3 | - recommender suite - recommender experiments suite | ||
4 | -""" | ||
5 | -__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | ||
6 | -__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | ||
7 | -__license__ = """ | ||
8 | - This program is free software: you can redistribute it and/or modify | ||
9 | - it under the terms of the GNU General Public License as published by | ||
10 | - the Free Software Foundation, either version 3 of the License, or | ||
11 | - (at your option) any later version. | ||
12 | - | ||
13 | - This program is distributed in the hope that it will be useful, | ||
14 | - but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | - GNU General Public License for more details. | ||
17 | - | ||
18 | - You should have received a copy of the GNU General Public License | ||
19 | - along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 | -""" | ||
21 | - | ||
22 | -import sys | ||
23 | -sys.path.insert(0,'../') | ||
24 | -from config import Config | ||
25 | -from data import PopconXapianIndex, PopconSubmission | ||
26 | -from recommender import Recommender | ||
27 | -from user import LocalSystem, User | ||
28 | -from evaluation import * | ||
29 | -import logging | ||
30 | -import random | ||
31 | -import Gnuplot | ||
32 | -import numpy | ||
33 | - | ||
34 | -#iterations = 3 | ||
35 | -#sample_proportions = [0.9] | ||
36 | -#weighting = [('bm25',1.2)] | ||
37 | -#collaborative = ['knn_eset'] | ||
38 | -#content_based = ['cb'] | ||
39 | -#hybrid = ['knnco'] | ||
40 | -#profile_size = [50,100] | ||
41 | -#popcon_size = ["1000"] | ||
42 | -#neighbors = [50] | ||
43 | - | ||
# Experiment parameters, read as module-level globals by the functions below.
# Number of repetitions per configuration (run_strategy loops range(iterations)).
iterations = 30
# Fractions of the user profile that are held out as the evaluation sample.
sample_proportions = [0.9]
# (weighting scheme, k1) pairs, assigned to cfg.weight and cfg.bm25_k1.
weighting = [('bm25',1.0)]
# Strategy names per family; get_label() picks the label format by family.
content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
collaborative = ['knn_eset','knn','knn_plus']
hybrid = ['knnco','knnco_eset']
# Values assigned to cfg.profile_size (content-based and hybrid runs).
profile_size = range(20,200,40)
# Values assigned to cfg.k_neighbors (collaborative and hybrid runs).
neighbors = range(10,510,50)
52 | - | ||
def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
    """
    Write the recall log of iteration `n` to the file "<log_file>-NN".

    The file starts with a header: the label description and values, then
    the repository size, the profile size and the number of held-out
    packages in `sample`. If `recommendation` has a `ranking` attribute, the
    header is followed by the sorted 0-based positions at which held-out
    packages appear in the ranking, and then by the held-out packages that
    do not appear in it at all.
    """
    # `with` guarantees the file is closed even if a write fails.
    with open(("%s-%.2d" % (log_file,n)),'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%.2d\n" % (label["values"],n))
        output.write("\n# repository profile sample\n%d %d %d\n" %
                     (repo_size,profile_size,len(sample)))
        if hasattr(recommendation,"ranking"):
            # Index the ranking once (first occurrence wins, like
            # list.index) instead of scanning the list for every package.
            position = {}
            for rank,pkg in enumerate(recommendation.ranking):
                position.setdefault(pkg,rank)
            ranks = sorted(position[pkg] for pkg in sample if pkg in position)
            notfound = [pkg for pkg in sample if pkg not in position]
            for r in ranks:
                output.write(str(r)+"\n")
            if notfound:
                output.write("# out of recommendation:\n")
                for pkg in notfound:
                    output.write(pkg+"\n")
75 | - | ||
def plot_roc(roc_points,eauc,c,p,log_file):
    """
    Plot a ROC curve and save it as "<log_file>-roc.png" / "-roc.ps".

    `roc_points` is drawn as a line on the [0:1.0] x [0:1.0] square together
    with the diagonal from (0,0) to (1,1). The values `c`, `p` and `eauc`
    are printed on the chart as the labels "C", "P(20)" and "AUC".
    """
    setup_name = log_file.split("/")[-1]
    output_base = log_file+"-roc"

    chart = Gnuplot.Gnuplot()
    chart('set style data lines')
    chart.xlabel('False Positive Rate')
    chart.ylabel('True Positive Rate')
    for axis in ('x','y'):
        chart('set %srange [0:1.0]' % axis)
    chart.title("Setup: %s" % setup_name)

    # Annotations stacked in the lower right corner of the chart.
    chart('set label "C %.2f" at 0.8,0.25' % c)
    chart('set label "P(20) %.2f" at 0.8,0.2' % p)
    chart('set label "AUC %.4f" at 0.8,0.15' % eauc)

    curve = Gnuplot.Data(roc_points,title="ROC")
    diagonal = Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7")
    chart.plot(curve,diagonal)

    chart.hardcopy(output_base+".png",terminal="png")
    chart.hardcopy(output_base+".ps",terminal="postscript",enhanced=1,color=1)
92 | - | ||
def get_label(cfg,sample_proportion):
    """
    Build the label dict describing the configuration held in `cfg`.

    The result has a "description" key naming the fields and a "values" key
    with the matching values; which fields are used depends on whether
    cfg.strategy is content-based, collaborative or hybrid. For any other
    strategy a message is printed and an empty dict is returned.
    `sample_proportion` is accepted but not used here.
    """
    strategy = cfg.strategy
    if strategy in content_based:
        return {"description": "strategy-profile",
                "values": "%s-profile%.3d" % (strategy,cfg.profile_size)}
    if strategy in collaborative:
        return {"description": "strategy-knn",
                "values": "%s-k%.3d" % (strategy,cfg.k_neighbors)}
    if strategy in hybrid:
        return {"description": "strategy-knn-profile",
                "values": "%s-k%.3d-profile%.3d" %
                          (strategy,cfg.k_neighbors,cfg.profile_size)}
    print("Unknown strategy")
    return {}
110 | - | ||
class ExperimentResults:
    """
    Collects precision, recall and false positive rate at several
    recommendation sizes (1, 10, 20, ... below the repository size).

    For every size the metric dicts hold one value per add_result() call.
    `recommended` is the union of all items ever passed in as a ranking.
    """
    def __init__(self,repo_size):
        self.repository_size = repo_size
        self.precision = {}
        self.recall = {}
        self.fpr = {}
        self.recommended = set()
        # list(...) keeps the concatenation valid where range() is lazy.
        points = [1]+list(range(10,self.repository_size,10))
        for size in points:
            self.precision[size] = []
            self.recall[size] = []
            self.fpr[size] = []

    def add_result(self,ranking,sample):
        """
        Evaluate the first `size` items of `ranking` against `sample` (the
        held-out items) for every tracked size and record the metrics.
        """
        self.recommended = self.recommended.union(ranking)
        # get data only for point
        for size in self.precision.keys():
            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repository_size)
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            self.fpr[size].append(evaluation.run(FPR()))

    # Average ROC by threshold (= size of recommendation)
    def get_roc_points(self):
        """
        Return the sorted list of [mean fpr, mean recall] pairs, one per
        recommendation size. Sizes without recorded values are skipped, so
        the list is empty if add_result() was never called.
        """
        points = []
        for size in self.recall:
            tpr = self.recall[size]
            fpr = self.fpr[size]
            if not tpr or not fpr:
                continue
            points.append([float(sum(fpr))/len(fpr),float(sum(tpr))/len(tpr)])
        return sorted(points)
143 | - | ||
def run_strategy(cfg,user):
    """
    Run the ROC experiment for the strategy currently configured in `cfg`.

    For every weighting scheme and sample proportion, `iterations` rounds
    are run: a random part of the user's profile is held out, a
    recommendation as long as the whole repository is requested for the
    remaining profile and evaluated against the held-out part. Per-round
    recall logs, a summary log and the ROC plot are written under
    results/roc-suite/<first 8 chars of the user id>/.
    """
    # Local import: `os` is not among the modules imported by this file.
    import os

    for weight in weighting:
        cfg.weight = weight[0]
        cfg.bm25_k1 = weight[1]
        rec = Recommender(cfg)
        repo_size = rec.items_repository.get_doccount()
        for proportion in sample_proportions:
            results = ExperimentResults(repo_size)
            label = get_label(cfg,proportion)
            user_dir = ("results/roc-suite/%s" % user.user_id[:8])
            if not os.path.exists(user_dir):
                # makedirs also creates "results/roc-suite" when missing.
                os.makedirs(user_dir)
            log_file = os.path.join(user_dir,label["values"])
            evaluated = 0
            for n in range(iterations):
                # Fill sample profile: move a random share of the profile
                # into `sample`; the rest is what the recommender sees.
                profile_len = len(user.pkg_profile)
                item_score = dict((pkg,user.item_score[pkg])
                                  for pkg in user.pkg_profile)
                sample_size = min(int(profile_len*proportion),len(item_score))
                held_out = random.sample(list(item_score),sample_size)
                sample = dict((pkg,item_score.pop(pkg)) for pkg in held_out)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,repo_size)
                write_recall_log(label,n,sample,recommendation,profile_len,repo_size,log_file)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
                    evaluated += 1
            if not evaluated:
                # No round produced a ranking: there is nothing to average.
                logging.warning("No ranking produced for %s; summary skipped",
                                label["values"])
                continue

            roc_points = results.get_roc_points()
            x_coord = [p[0] for p in roc_points]
            y_coord = [p[1] for p in roc_points]
            auc = numpy.trapz(y=y_coord, x=x_coord)
            # Extended AUC: also count the area from (0,0) to the first
            # point and from the last point to (1,1).
            eauc = (auc+
                    numpy.trapz(y=[0,roc_points[0][1]],x=[0,roc_points[0][0]])+
                    numpy.trapz(y=[roc_points[-1][1],1],x=[roc_points[-1][0],1]))
            # The value is reported as P(20), so read the size-20 entry (the
            # original read size 10). Fall back to the largest tracked size
            # below 20 when the repository is too small to track size 20.
            cutoff = max(size for size in results.precision if size <= 20)
            precision_20 = float(sum(results.precision[cutoff]))/len(results.precision[cutoff])
            coverage = len(results.recommended)/float(repo_size)
            with open(log_file,'w') as f:
                f.write("# %s\n# %s\n\n" %
                        (label["description"],label["values"]))
                f.write("# coverage \tp(20) \tauc \teauc\n\t%.2f \t%.2f \t%.4f \t%.4f\n\n" %
                        (coverage,precision_20,auc,eauc))
            plot_roc(roc_points,eauc,coverage,precision_20,log_file)
188 | - | ||
def run_content(user,cfg):
    """
    Run the experiment for every content-based strategy, once per
    configured profile size.
    """
    for strategy_name in content_based:
        cfg.strategy = strategy_name
        for n_terms in profile_size:
            cfg.profile_size = n_terms
            run_strategy(cfg,user)
195 | - | ||
def run_collaborative(user,cfg):
    """
    Run the experiment for every collaborative strategy, once per
    configured neighborhood size.
    """
    # The original also copied cfg.popcon_desktopapps / cfg.popcon_programs
    # into locals that were never used; those dead assignments are dropped.
    for strategy in collaborative:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            run_strategy(cfg,user)
204 | - | ||
def run_hybrid(user,cfg):
    """
    Run the experiment for every hybrid strategy, once per combination of
    configured neighborhood size and profile size.
    """
    # The original also copied cfg.popcon_desktopapps / cfg.popcon_programs
    # into locals that were never used; those dead assignments are dropped.
    for strategy in hybrid:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            for size in profile_size:
                cfg.profile_size = size
                run_strategy(cfg,user)
215 | - | ||
if __name__ == '__main__':
    # Entry point: roc-suite popcon_submission_path [content|collaborative|hybrid]
    # Without a mode argument all three experiment groups are run.
    usage = "Usage: roc-suite popcon_submission_path [content|collaborative|hybrid]"
    if len(sys.argv)<2:
        print(usage)
        sys.exit(1)

    runners = [("content",run_content),
               ("collaborative",run_collaborative),
               ("hybrid",run_hybrid)]
    # Only the arguments after the submission path select experiment groups,
    # so the script name or the path can never be mistaken for a mode.
    requested = sys.argv[2:]
    selected = [run for mode,run in runners
                if not requested or mode in requested]
    if not selected:
        # Arguments were given but none names a known group: report it
        # instead of silently running nothing.
        print(usage)
        sys.exit(1)

    cfg = Config()
    # NOTE(review): PopconSystem is not among the names this file imports
    # explicitly (only LocalSystem and User come from `user`); confirm it is
    # provided by `from evaluation import *` or add the import.
    user = PopconSystem(sys.argv[1])
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    for run in selected:
        run(user,cfg)
231 | - run_hybrid(user,cfg) |
src/experiments/strategies-suite.py
@@ -1,274 +0,0 @@ | @@ -1,274 +0,0 @@ | ||
1 | -#!/usr/bin/env python | ||
2 | -""" | ||
3 | - recommender suite - recommender experiments suite | ||
4 | -""" | ||
5 | -__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | ||
6 | -__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | ||
7 | -__license__ = """ | ||
8 | - This program is free software: you can redistribute it and/or modify | ||
9 | - it under the terms of the GNU General Public License as published by | ||
10 | - the Free Software Foundation, either version 3 of the License, or | ||
11 | - (at your option) any later version. | ||
12 | - | ||
13 | - This program is distributed in the hope that it will be useful, | ||
14 | - but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | - GNU General Public License for more details. | ||
17 | - | ||
18 | - You should have received a copy of the GNU General Public License | ||
19 | - along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 | -""" | ||
21 | - | ||
22 | -import sys | ||
23 | -sys.path.insert(0,'../') | ||
24 | -from config import Config | ||
25 | -from data import PopconXapianIndex, PopconSubmission, AppAptXapianIndex | ||
26 | -from recommender import Recommender | ||
27 | -from user import LocalSystem, User | ||
28 | -from evaluation import * | ||
29 | -import logging | ||
30 | -import random | ||
31 | -import Gnuplot | ||
32 | - | ||
33 | -#iterations = 3 | ||
34 | -#sample_proportions = [0.9] | ||
35 | -#weighting = [('bm25',1.2)] | ||
36 | -#collaborative = ['knn'] | ||
37 | -#content_based = [] | ||
38 | -#hybrid = ['knnco'] | ||
39 | -#profile_size = [50,100] | ||
40 | -#popcon_size = ["1000"] | ||
41 | -#neighbors = [50] | ||
42 | - | ||
# Experiment parameters, read as module-level globals by the functions below.
# Number of repetitions per configuration (run_strategy loops range(iterations)).
iterations = 10
# Fractions of the user profile that are held out as the evaluation sample.
sample_proportions = [0.5, 0.6, 0.7, 0.8, 0.9]
# (weighting scheme, k1) pairs, assigned to cfg.weight and cfg.bm25_k1.
weighting = [('bm25',1.2), ('bm25',1.6), ('bm25',2.0), ('trad',0)]
# Strategy names per family; get_label() picks the label format by family.
content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
collaborative = ['knn_eset','knn','knn_plus']
hybrid = ['knnco','knnco_eset']

# Values assigned to cfg.profile_size (content-based and hybrid runs).
profile_size = range(20,100,20)
#popcon_size = [1000,10000,50000,'full']
# Values assigned to cfg.k_neighbors (collaborative and hybrid runs).
neighbors = range(10,510,50)
53 | - | ||
def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
    """
    Write the recall log of iteration `n` to the file "<log_file>-<n>".

    The file starts with a header: the label description and values, then
    the repository size, the profile size and the number of held-out
    packages in `sample`. If `recommendation` has a `ranking` attribute, the
    header is followed by the sorted 0-based positions at which held-out
    packages appear in the ranking, and then by the held-out packages that
    do not appear in it at all.
    """
    # `with` guarantees the file is closed even if a write fails.
    with open(("%s-%d" % (log_file,n)),'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%d\n" % (label["values"],n))
        output.write("\n%d %d %d\n" %
                     (repo_size,profile_size,len(sample)))
        if hasattr(recommendation,"ranking"):
            # Index the ranking once (first occurrence wins, like
            # list.index) instead of scanning the list for every package.
            position = {}
            for rank,pkg in enumerate(recommendation.ranking):
                position.setdefault(pkg,rank)
            ranks = sorted(position[pkg] for pkg in sample if pkg in position)
            notfound = [pkg for pkg in sample if pkg not in position]
            for r in ranks:
                output.write(str(r)+"\n")
            if notfound:
                output.write("Out of recommendation:\n")
                for pkg in notfound:
                    output.write(pkg+"\n")
76 | - | ||
def plot_summary(precision,recall,f1,f05,accuracy,log_file):
    """
    Plot the metric summaries against the recommendation size.

    One line per metric is drawn (accuracy, precision, recall, F_1, F_0.5).
    The chart is saved as "<log_file>.png" / ".ps" and, after switching the
    x axis to log scale, again as "<log_file>-logscale.png" / ".ps".
    """
    setup_name = log_file.split("/")[-1]
    curves = [Gnuplot.Data(accuracy,title="Accuracy"),
              Gnuplot.Data(precision,title="Precision"),
              Gnuplot.Data(recall,title="Recall"),
              Gnuplot.Data(f1,title="F_1"),
              Gnuplot.Data(f05,title="F_0.5")]

    chart = Gnuplot.Gnuplot()
    chart('set style data lines')
    chart.xlabel('Recommendation size')
    chart.title("Setup: %s" % setup_name)
    chart.plot(*curves)
    chart.hardcopy(log_file+".png",terminal="png")
    chart.hardcopy(log_file+".ps",terminal="postscript",enhanced=1,color=1)

    # Same chart again with a logarithmic x axis.
    chart('set logscale x')
    chart('replot')
    logscale_base = log_file+"-logscale"
    chart.hardcopy(logscale_base+".png",terminal="png")
    chart.hardcopy(logscale_base+".ps",terminal="postscript",enhanced=1,color=1)
94 | - | ||
def get_label(cfg,sample_proportion):
    """
    Build the label dict describing the configuration held in `cfg`.

    The result has a "description" key naming the fields and a "values" key
    with the matching values, in the same order; which fields are used
    depends on whether cfg.strategy is content-based, collaborative or
    hybrid. For any other strategy a message is printed and an empty dict is
    returned.
    """
    label = {}
    if cfg.strategy in content_based:
        # Description fields are listed in the order the values are emitted
        # (the original named "filter" before "profile").
        label["description"] = "strategy-profile-filter-k1_bm25-sample"
        label["values"] = ("%s-profile%d-%s-kbm%.1f-sample%.1f" %
                           (cfg.strategy,cfg.profile_size,
                            cfg.pkgs_filter.split("/")[-1],
                            cfg.bm25_k1,sample_proportion))
    elif cfg.strategy in collaborative:
        label["description"] = "strategy-knn-filter-k1_bm25-sample"
        label["values"] = ("%s-k%d-%s-kbm%.1f-sample%.1f" %
                           (cfg.strategy,cfg.k_neighbors,
                            cfg.pkgs_filter.split("/")[-1],
                            cfg.bm25_k1,sample_proportion))
    elif cfg.strategy in hybrid:
        # Same ordering fix as for the content-based description.
        label["description"] = "strategy-knn-profile-filter-k1_bm25-sample"
        label["values"] = ("%s-k%d-profile%d-%s-kbm%.1f-sample%.1f" %
                           (cfg.strategy,cfg.k_neighbors,cfg.profile_size,
                            cfg.pkgs_filter.split("/")[-1],
                            cfg.bm25_k1,sample_proportion))
    else:
        print("Unknown strategy")
    return label
118 | - | ||
119 | -class ExperimentResults: | ||
120 | - def __init__(self,repo_size): | ||
121 | - self.repository_size = repo_size | ||
122 | - self.accuracy = {} | ||
123 | - self.precision = {} | ||
124 | - self.recall = {} | ||
125 | - self.f1 = {} | ||
126 | - self.f05 = {} | ||
127 | - points = [1]+range(10,200,10)+range(200,self.repository_size,100) | ||
128 | - for size in points: | ||
129 | - self.accuracy[size] = [] | ||
130 | - self.precision[size] = [] | ||
131 | - self.recall[size] = [] | ||
132 | - self.f1[size] = [] | ||
133 | - self.f05[size] = [] | ||
134 | - | ||
135 | - def add_result(self,ranking,sample): | ||
136 | - for size in self.accuracy.keys(): | ||
137 | - predicted = RecommendationResult(dict.fromkeys(ranking[:size],1)) | ||
138 | - real = RecommendationResult(sample) | ||
139 | - evaluation = Evaluation(predicted,real,self.repository_size) | ||
140 | - self.accuracy[size].append(evaluation.run(Accuracy())) | ||
141 | - self.precision[size].append(evaluation.run(Precision())) | ||
142 | - self.recall[size].append(evaluation.run(Recall())) | ||
143 | - self.f1[size].append(evaluation.run(F_score(1))) | ||
144 | - self.f05[size].append(evaluation.run(F_score(0.5))) | ||
145 | - | ||
146 | - def get_precision_summary(self): | ||
147 | - summary = [[size,sum(values)/len(values)] for size,values in self.precision.items()] | ||
148 | - return sorted(summary) | ||
149 | - | ||
150 | - def get_recall_summary(self): | ||
151 | - summary = [[size,sum(values)/len(values)] for size,values in self.recall.items()] | ||
152 | - return sorted(summary) | ||
153 | - | ||
154 | - def get_f1_summary(self): | ||
155 | - summary = [[size,sum(values)/len(values)] for size,values in self.f1.items()] | ||
156 | - return sorted(summary) | ||
157 | - | ||
158 | - def get_f05_summary(self): | ||
159 | - summary = [[size,sum(values)/len(values)] for size,values in self.f05.items()] | ||
160 | - return sorted(summary) | ||
161 | - | ||
162 | - def get_accuracy_summary(self): | ||
163 | - summary = [[size,sum(values)/len(values)] for size,values in self.accuracy.items()] | ||
164 | - return sorted(summary) | ||
165 | - | ||
166 | - def best_precision(self): | ||
167 | - size = max(self.precision, key = lambda x: max(self.precision[x])) | ||
168 | - return (size,max(self.precision[size])) | ||
169 | - | ||
170 | - def best_f1(self): | ||
171 | - size = max(self.f1, key = lambda x: max(self.f1[x])) | ||
172 | - return (size,max(self.f1[size])) | ||
173 | - | ||
174 | - def best_f05(self): | ||
175 | - size = max(self.f05, key = lambda x: max(self.f05[x])) | ||
176 | - return (size,max(self.f05[size])) | ||
177 | - | ||
def run_strategy(cfg,user):
    """
    Run the metrics experiment for the strategy currently configured in
    `cfg`.

    For every weighting scheme and sample proportion, `iterations` rounds
    are run: a random part of the user's profile is held out, a
    recommendation as long as the whole repository is requested for the
    remaining profile and evaluated against the held-out part. Per-round
    recall logs, a summary log and the metric plots are written under
    results/strategies/.
    """
    # Local import: `os` is not among the modules imported by this file.
    import os

    results_dir = "results/strategies"
    if not os.path.exists(results_dir):
        # The log files below cannot be opened unless the directory exists.
        os.makedirs(results_dir)

    for weight in weighting:
        cfg.weight = weight[0]
        cfg.bm25_k1 = weight[1]
        rec = Recommender(cfg)
        repo_size = rec.items_repository.get_doccount()
        for proportion in sample_proportions:
            results = ExperimentResults(repo_size)
            label = get_label(cfg,proportion)
            log_file = "results/strategies/"+label["values"]
            last_ranked = None
            for n in range(iterations):
                # Fill sample profile: move a random share of the profile
                # into `sample`; the rest is what the recommender sees.
                # (Named profile_len so the module-level `profile_size` list
                # is not shadowed.)
                profile_len = len(user.pkg_profile)
                item_score = dict((pkg,user.item_score[pkg])
                                  for pkg in user.pkg_profile)
                sample_size = min(int(profile_len*proportion),len(item_score))
                held_out = random.sample(list(item_score),sample_size)
                sample = dict((pkg,item_score.pop(pkg)) for pkg in held_out)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,repo_size)
                write_recall_log(label,n,sample,recommendation,profile_len,repo_size,log_file)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
                    last_ranked = recommendation
            if last_ranked is None:
                # No round produced a ranking: there is nothing to average.
                logging.warning("No ranking produced for %s; summary skipped",
                                label["values"])
                continue

            precision_10 = float(sum(results.precision[10]))/len(results.precision[10])
            f1_10 = float(sum(results.f1[10]))/len(results.f1[10])
            f05_10 = float(sum(results.f05[10]))/len(results.f05[10])
            best_precision = results.best_precision()
            best_f1 = results.best_f1()
            best_f05 = results.best_f05()
            with open(log_file,'w') as f:
                # "coverage" reports the size of the last recommendation
                # that carried a ranking.
                f.write("# %s\n# %s\n\ncoverage %d\n\n" %
                        (label["description"],label["values"],last_ranked.size))
                f.write("# best results (recommendation size; metric)\n")
                f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\nf05 (%d; %.2f)\n\n" %
                        (best_precision[0],best_precision[1],
                         best_f1[0],best_f1[1],
                         best_f05[0],best_f05[1]))
                f.write("# recommendation size 10\nprecision (10; %.2f)\nf1 (10; %.2f)\nf05 (10; %.2f)" %
                        (precision_10,f1_10,f05_10))
            precision = results.get_precision_summary()
            recall = results.get_recall_summary()
            f1 = results.get_f1_summary()
            f05 = results.get_f05_summary()
            accuracy = results.get_accuracy_summary()
            plot_summary(precision,recall,f1,f05,accuracy,log_file)
223 | - | ||
def run_content(user,cfg):
    """
    Run the experiment for every content-based strategy, once per
    configured profile size.
    """
    for strategy_name in content_based:
        cfg.strategy = strategy_name
        for n_terms in profile_size:
            cfg.profile_size = n_terms
            run_strategy(cfg,user)
230 | - | ||
def run_collaborative(user,cfg):
    """
    Run the experiment for every collaborative strategy, once per
    configured neighborhood size.
    """
    # The original copied cfg.popcon_desktopapps / cfg.popcon_programs into
    # locals used only by a commented-out popcon_size loop; both the dead
    # assignments and the disabled loop are dropped.
    for strategy in collaborative:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            run_strategy(cfg,user)
243 | - | ||
def run_hybrid(user,cfg):
    """
    Run the experiment for every hybrid strategy, once per combination of
    configured neighborhood size and profile size.
    """
    # The original copied cfg.popcon_desktopapps / cfg.popcon_programs into
    # locals used only by a commented-out popcon_size loop; both the dead
    # assignments and the disabled loop are dropped.
    for strategy in hybrid:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            for size in profile_size:
                cfg.profile_size = size
                run_strategy(cfg,user)
258 | - | ||
if __name__ == '__main__':
    # Entry point: strategies-suite [content] [collaborative] [hybrid]
    # Without arguments all three experiment groups are run.
    # Alternative users tried before:
    #   LocalSystem()
    #   RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
    #   PopconSystem("/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
    cfg = Config()
    submission_path = "/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7"
    # NOTE(review): PopconSystem is not among the names this file imports
    # explicitly (only LocalSystem and User come from `user`); confirm it is
    # provided by `from evaluation import *` or add the import.
    user = PopconSystem(submission_path)
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    run_everything = len(sys.argv)<2
    if run_everything or "content" in sys.argv:
        run_content(user,cfg)
    if run_everything or "collaborative" in sys.argv:
        run_collaborative(user,cfg)
    if run_everything or "hybrid" in sys.argv:
        run_hybrid(user,cfg)
274 | - run_hybrid(user,cfg) |