Commit ccd4ef5568d421a430f37975baff0bacf775b91c

Authored by Tássia Camões Araújo
1 parent e2be2c33
Exists in master and in 1 other branch add_vagrant

Renamed files.

src/experiments/k-suite.py
... ... @@ -1,186 +0,0 @@
1   -#!/usr/bin/env python
2   -"""
3   - k-suite - experiment different neighborhood sizes
4   -"""
5   -__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
6   -__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
7   -__license__ = """
8   - This program is free software: you can redistribute it and/or modify
9   - it under the terms of the GNU General Public License as published by
10   - the Free Software Foundation, either version 3 of the License, or
11   - (at your option) any later version.
12   -
13   - This program is distributed in the hope that it will be useful,
14   - but WITHOUT ANY WARRANTY; without even the implied warranty of
15   - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16   - GNU General Public License for more details.
17   -
18   - You should have received a copy of the GNU General Public License
19   - along with this program. If not, see <http://www.gnu.org/licenses/>.
20   -"""
21   -
22   -import sys
23   -sys.path.insert(0,'../')
24   -from config import Config
25   -from data import PopconXapianIndex, PopconSubmission
26   -from recommender import Recommender
27   -from user import LocalSystem, User
28   -from evaluation import *
29   -import logging
30   -import random
31   -import Gnuplot
32   -import numpy
33   -
def plot_roc(k,roc_points,log_file):
    """Draw the ROC points collected for neighborhood size `k`.

    `roc_points` is plotted as points together with a straight line from
    (0,0) to (1,1). The chart is saved twice, as `<log_file>-kNNN.png` and
    `<log_file>-kNNN.ps`, where NNN is `k` zero-padded to three digits.
    """
    setup_name = log_file.split("/")[-1]
    image_base = log_file + ("-k%.3d" % k)

    chart = Gnuplot.Gnuplot()
    chart('set style data points')
    chart.xlabel('False Positive Rate')
    chart.ylabel('True Positive Rate')
    # Both axes are fixed to the [0, 1] interval.
    chart('set xrange [0:1.0]')
    chart('set yrange [0:1.0]')
    chart.title("Setup: %s-k%d" % (setup_name,k))

    diagonal = Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7")
    measured = Gnuplot.Data(roc_points)
    chart.plot(diagonal,measured)

    chart.hardcopy(image_base+".png",terminal="png")
    chart.hardcopy(image_base+".ps",terminal="postscript",enhanced=1,color=1)
46   -
def plot_summary(precision,f05,mcc,log_file):
    """Plot the mean precision, F0.5 and MCC against the neighborhood size.

    Each of `precision`, `f05` and `mcc` maps a neighborhood size k to the
    list of values collected for that k. The chart is saved as
    `<log_file>.png` and `<log_file>.ps`.
    """
    def mean_by_k(metric):
        # The chart uses the "lines" style, which joins points in the order
        # they are supplied, so the points must be sorted by k. Sizes that
        # have no values yet are skipped instead of dividing by zero.
        return [[k,float(sum(metric[k]))/len(metric[k])]
                for k in sorted(metric) if metric[k]]

    g = Gnuplot.Gnuplot()
    g('set style data lines')
    g.xlabel('Neighborhood (k)')
    g.title("Setup: %s-size20" % (log_file.split("/")[-1]))
    g.plot(Gnuplot.Data(mean_by_k(precision),title="P"),
           Gnuplot.Data(mean_by_k(f05),title="F05"),
           Gnuplot.Data(mean_by_k(mcc),title="MCC"))
    g.hardcopy(log_file+".png",terminal="png")
    g.hardcopy(log_file+".ps",terminal="postscript",enhanced=1,color=1)
57   -
class ExperimentResults:
    """Accumulates the metrics of repeated evaluations of one setup.

    Every call to `add_result` appends one value to each metric list; the
    `get_*` methods return the arithmetic mean of what was collected so far.
    """

    def __init__(self,repo_size):
        self.repository_size = repo_size
        self.precision, self.recall, self.fpr = [], [], []
        self.f05, self.mcc = [], []

    def add_result(self,ranking,sample):
        """Evaluate `ranking` against `sample` and record all five metrics."""
        evaluation = Evaluation(RecommendationResult(dict.fromkeys(ranking,1)),
                                RecommendationResult(sample),
                                self.repository_size)
        # (destination list, metric factory) pairs, kept in the recording order.
        recorders = (
            (self.precision, Precision),
            (self.recall, Recall),
            (self.fpr, FPR),
            (self.f05, lambda: F_score(0.5)),
            (self.mcc, MCC),
        )
        for values, make_metric in recorders:
            values.append(evaluation.run(make_metric()))

    def _average(self,values):
        # Mean of the collected values; 0 when nothing was collected.
        if not values:
            return 0
        return sum(values)/len(values)

    def get_roc_point(self):
        """Return [mean FPR, mean recall], or [0,0] when either list is empty."""
        if self.recall and self.fpr:
            return [sum(self.fpr)/len(self.fpr),
                    sum(self.recall)/len(self.recall)]
        return [0,0]

    def get_precision_summary(self):
        """Return the mean precision (0 when nothing was recorded)."""
        return self._average(self.precision)

    def get_f05_summary(self):
        """Return the mean F0.5 score (0 when nothing was recorded)."""
        return self._average(self.f05)

    def get_mcc_summary(self):
        """Return the mean MCC (0 when nothing was recorded)."""
        return self._average(self.mcc)
95   -
def main(argv):
    """Run the k-suite: evaluate one strategy over several neighborhood sizes.

    `argv[1]` is the strategy name and `argv[2]` a file listing one popcon
    submission id per line. For every listed user and every k in `neighbors`
    the user's profile is split at random `iterations` times: 90% of the
    packages are held out as `sample` and the remainder is given to the
    recommender, whose ranking is then evaluated against `sample`.

    Output goes to `results/k-suite/<sample file name>/`:
      * `<strategy>`       - one summary line per k (coverage and mean metrics)
      * `<strategy>-kNNN`  - one line per user for that k
      * the charts written by `plot_summary` and `plot_roc`
    """
    # Imported here because the file's import block does not import it.
    import os

    if len(argv)<3:
        print("Usage: k-suite strategy_str sample_file")
        sys.exit(1)
    threshold = 20      # recommendation size requested from the recommender
    iterations = 30     # random profile splits per (user, k) pair
    neighbors = [3,5,10,50,100,150,200,300,400,500]
    cfg = Config()
    cfg.strategy = argv[1]
    sample_file = argv[2]
    sample_name = sample_file.split('/')[-1]

    population_sample = []
    with open(sample_file,'r') as f:
        for line in f:
            user_id = line.strip('\n')
            if user_id:     # ignore blank lines
                population_sample.append(
                    os.path.join(cfg.popcon_dir,user_id[:2],user_id))

    # setup dictionaries and files
    roc_summary = {}
    recommended = {}
    precision_summary = {}
    f05_summary = {}
    mcc_summary = {}
    sample_dir = "results/k-suite/%s" % sample_name
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)
    log_file = os.path.join(sample_dir,cfg.strategy)
    with open(log_file,'w') as f:
        f.write("# %s\n\n" % sample_name)
        f.write("# strategy %s recommendation_size %d iterations %d\n\n" %
                (cfg.strategy,threshold,iterations))
        f.write("# k coverage \tprecision \tf05 \tmcc\n\n")

    for k in neighbors:
        roc_summary[k] = []
        recommended[k] = set()
        precision_summary[k] = []
        f05_summary[k] = []
        mcc_summary[k] = []
        with open(log_file+"-k%.3d"%k,'w') as f:
            f.write("# %s\n\n" % sample_name)
            f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))
            f.write("# roc_point \tprecision \tf05 \tmcc\n\n")

    # main loop per user
    repo_size = None
    for submission_file in population_sample:
        user = PopconSystem(submission_file)
        user.filter_pkg_profile(cfg.pkgs_filter)
        user.maximal_pkg_profile()
        sample_size = int(len(user.pkg_profile)*0.9)
        for k in neighbors:
            cfg.k_neighbors = k
            rec = Recommender(cfg)
            repo_size = rec.items_repository.get_doccount()
            results = ExperimentResults(repo_size)
            # n iterations for same recommender and user
            for n in range(iterations):
                # Hold out a random 90% of the profile; the rest stays with
                # the user that is handed to the recommender.
                item_score = dict((pkg,user.item_score[pkg])
                                  for pkg in user.pkg_profile)
                held_out = random.sample(list(item_score),
                                         min(sample_size,len(item_score)))
                sample = dict((pkg,item_score.pop(pkg)) for pkg in held_out)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,threshold)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
                    recommended[k].update(recommendation.ranking)
            # save summary
            roc_point = results.get_roc_point()
            roc_summary[k].append(roc_point)
            precision = results.get_precision_summary()
            precision_summary[k].append(precision)
            f05 = results.get_f05_summary()
            f05_summary[k].append(f05)
            mcc = results.get_mcc_summary()
            mcc_summary[k].append(mcc)
            with open(log_file+"-k%.3d"%k,'a') as f:
                f.write("[%.2f,%.2f] \t%.4f \t%.4f \t%.4f\n" %
                        (roc_point[0],roc_point[1],precision,f05,mcc))

    # back to main flow
    if repo_size is None:
        # No user was processed, so there is nothing to average or plot.
        print("No submissions listed in %s" % sample_file)
        return
    plot_summary(precision_summary,f05_summary,mcc_summary,log_file)
    with open(log_file,'a') as f:
        for k in neighbors:
            # Share of the repository recommended at least once for this k.
            coverage = len(recommended[k])/float(repo_size)
            f.write("%3d \t%.2f \t%.4f \t%.4f \t%.4f\n" %
                    (k,coverage,
                     float(sum(precision_summary[k]))/len(precision_summary[k]),
                     float(sum(f05_summary[k]))/len(f05_summary[k]),
                     float(sum(mcc_summary[k]))/len(mcc_summary[k])))
            plot_roc(k,roc_summary[k],log_file)


if __name__ == '__main__':
    main(sys.argv)
src/experiments/roc-suite.py
... ... @@ -1,231 +0,0 @@
1   -#!/usr/bin/env python
2   -"""
3   - recommender suite - recommender experiments suite
4   -"""
5   -__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
6   -__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
7   -__license__ = """
8   - This program is free software: you can redistribute it and/or modify
9   - it under the terms of the GNU General Public License as published by
10   - the Free Software Foundation, either version 3 of the License, or
11   - (at your option) any later version.
12   -
13   - This program is distributed in the hope that it will be useful,
14   - but WITHOUT ANY WARRANTY; without even the implied warranty of
15   - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16   - GNU General Public License for more details.
17   -
18   - You should have received a copy of the GNU General Public License
19   - along with this program. If not, see <http://www.gnu.org/licenses/>.
20   -"""
21   -
22   -import sys
23   -sys.path.insert(0,'../')
24   -from config import Config
25   -from data import PopconXapianIndex, PopconSubmission
26   -from recommender import Recommender
27   -from user import LocalSystem, User
28   -from evaluation import *
29   -import logging
30   -import random
31   -import Gnuplot
32   -import numpy
33   -
# Experiment grid used by the run_* functions below.
#
# A smaller grid is kept here for reference (presumably for quick trial runs):
#   iterations = 3, sample_proportions = [0.9], weighting = [('bm25',1.2)],
#   collaborative = ['knn_eset'], content_based = ['cb'], hybrid = ['knnco'],
#   profile_size = [50,100], popcon_size = ["1000"], neighbors = [50]

# Number of random profile splits per setup.
iterations = 30
# Fraction of the user profile moved into the held-out sample.
sample_proportions = [0.9]
# (weighting scheme, BM25 k1 parameter) pairs.
weighting = [('bm25',1.0)]

# Strategy names, grouped by recommender family.
content_based = [
    'cb','cbt','cbd','cbh',
    'cb_eset','cbt_eset','cbd_eset','cbh_eset',
]
collaborative = ['knn_eset','knn','knn_plus']
hybrid = ['knnco','knnco_eset']

# Values tried for cfg.profile_size and cfg.k_neighbors.
profile_size = range(20,200,40)
neighbors = range(10,510,50)
52   -
def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
    """Write the recall log of iteration `n` to the file `<log_file>-NN`.

    The file starts with two comment lines built from `label["description"]`
    and `label["values"]`, followed by the repository size, the profile size
    and the number of held-out packages. If `recommendation` has a `ranking`
    attribute, the 0-based rank of every package of `sample` found in the
    ranking is written in ascending order, followed by the packages that do
    not appear in the ranking at all.
    """
    # "with" closes the file even when one of the writes fails.
    with open("%s-%.2d" % (log_file,n),'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%.2d\n" % (label["values"],n))
        output.write("\n# repository profile sample\n%d %d %d\n" %
                     (repo_size,profile_size,len(sample)))
        if hasattr(recommendation,"ranking"):
            # Index the ranking once (first occurrence wins, like list.index)
            # instead of scanning the whole list for every sampled package.
            position = {}
            for rank,pkg in enumerate(recommendation.ranking):
                position.setdefault(pkg,rank)
            ranks = sorted(position[pkg] for pkg in sample if pkg in position)
            notfound = [pkg for pkg in sample if pkg not in position]
            for r in ranks:
                output.write(str(r)+"\n")
            if notfound:
                output.write("# out of recommendation:\n")
                for pkg in notfound:
                    output.write(pkg+"\n")
75   -
def plot_roc(roc_points,eauc,c,p,log_file):
    """Plot a ROC curve annotated with coverage, P(20) and AUC.

    `roc_points` is drawn as a line together with the (0,0)-(1,1) diagonal.
    `c`, `p` and `eauc` are only printed as labels on the chart. The chart
    is saved as `<log_file>-roc.png` and `<log_file>-roc.ps`.
    """
    setup_name = log_file.split("/")[-1]
    image_base = log_file + "-roc"

    chart = Gnuplot.Gnuplot()
    chart('set style data lines')
    chart.xlabel('False Positive Rate')
    chart.ylabel('True Positive Rate')
    chart('set xrange [0:1.0]')
    chart('set yrange [0:1.0]')
    chart.title("Setup: %s" % setup_name)

    # Text annotations placed near the lower right corner of the chart.
    chart('set label "C %.2f" at 0.8,0.25' % c)
    chart('set label "P(20) %.2f" at 0.8,0.2' % p)
    chart('set label "AUC %.4f" at 0.8,0.15' % eauc)

    curve = Gnuplot.Data(roc_points,title="ROC")
    diagonal = Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7")
    chart.plot(curve,diagonal)

    chart.hardcopy(image_base+".png",terminal="png")
    chart.hardcopy(image_base+".ps",terminal="postscript",enhanced=1,color=1)
92   -
def get_label(cfg,sample_proportion):
    """Build the label that names the log files of the current setup.

    Returns a dict with two keys: "description" names the fields that make
    up the label and "values" holds the label itself, built from
    `cfg.strategy` plus `cfg.profile_size` and/or `cfg.k_neighbors`
    depending on the strategy family. `sample_proportion` is accepted but
    not used here.

    For a strategy that is in none of `content_based`, `collaborative` or
    `hybrid`, a message is printed and an empty dict is returned.
    """
    label = {}
    if cfg.strategy in content_based:
        label["description"] = "strategy-profile"
        label["values"] = ("%s-profile%.3d" %
                           (cfg.strategy,cfg.profile_size))
    elif cfg.strategy in collaborative:
        label["description"] = "strategy-knn"
        label["values"] = ("%s-k%.3d" %
                           (cfg.strategy,cfg.k_neighbors))
    elif cfg.strategy in hybrid:
        label["description"] = "strategy-knn-profile"
        label["values"] = ("%s-k%.3d-profile%.3d" %
                           (cfg.strategy,cfg.k_neighbors,cfg.profile_size))
    else:
        # Call form with a single argument: same output on Python 2 and 3.
        print("Unknown strategy")
    return label
110   -
class ExperimentResults:
    """Collects precision, recall and FPR at several recommendation sizes.

    The sizes are 1 and every multiple of 10 below `repo_size`. For each
    size the metrics of all evaluated rankings are stored in a list.
    `recommended` is the set of every item that appeared in any ranking.
    """

    def __init__(self,repo_size):
        self.repository_size = repo_size
        self.precision = {}
        self.recall = {}
        self.fpr = {}
        self.recommended = set()
        # list(...) is needed because range() is not a list on Python 3.
        points = [1]+list(range(10,self.repository_size,10))
        for size in points:
            self.precision[size] = []
            self.recall[size] = []
            self.fpr[size] = []

    def add_result(self,ranking,sample):
        """Evaluate the top-`size` prefix of `ranking` for every size."""
        self.recommended.update(ranking)
        for size in self.precision:
            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repository_size)
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            self.fpr[size].append(evaluation.run(FPR()))

    # Average ROC by threshold (= size of recommendation)
    def get_roc_points(self):
        """Return the sorted [mean FPR, mean recall] points, one per size.

        Sizes without recorded values are skipped, so the result is an empty
        list when `add_result` was never called.
        """
        points = []
        for size in self.recall:
            tpr = self.recall[size]
            fpr = self.fpr[size]
            if tpr and fpr:
                points.append([sum(fpr)/len(fpr),sum(tpr)/len(tpr)])
        return sorted(points)
143   -
def run_strategy(cfg,user):
    """Run the ROC experiment for the strategy currently set in `cfg`.

    For every entry of `weighting` and every entry of `sample_proportions`
    the user's profile is split at random `iterations` times: `proportion`
    of the packages are held out as `sample`, the rest is handed to the
    recommender, and the full ranking is evaluated against `sample`.

    Output goes to `results/roc-suite/<first 8 chars of user id>/`: one
    recall log per iteration (see `write_recall_log`), one summary file named
    after the label, and the ROC chart drawn by `plot_roc`.
    """
    # Imported here because the file's import block does not import it.
    import os

    for weight in weighting:
        cfg.weight = weight[0]
        cfg.bm25_k1 = weight[1]
        rec = Recommender(cfg)
        repo_size = rec.items_repository.get_doccount()
        for proportion in sample_proportions:
            results = ExperimentResults(repo_size)
            label = get_label(cfg,proportion)
            user_dir = ("results/roc-suite/%s" % user.user_id[:8])
            if not os.path.exists(user_dir):
                # makedirs also creates "results/roc-suite" when it is missing.
                os.makedirs(user_dir)
            log_file = os.path.join(user_dir,label["values"])
            profile_len = len(user.pkg_profile)
            sample_size = int(profile_len*proportion)
            for n in range(iterations):
                # Fill sample profile: move a random subset of the profile
                # into `sample`; what is left is the iteration user's profile.
                item_score = dict((pkg,user.item_score[pkg])
                                  for pkg in user.pkg_profile)
                held_out = random.sample(list(item_score),
                                         min(sample_size,len(item_score)))
                sample = dict((pkg,item_score.pop(pkg)) for pkg in held_out)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,repo_size)
                write_recall_log(label,n,sample,recommendation,profile_len,repo_size,log_file)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)

            # The value is reported as p(20) in the log and on the chart, so
            # read it at recommendation size 20 (the original indexed size 10).
            precisions_at_20 = results.precision.get(20)
            if not precisions_at_20:
                # No ranking was evaluated (or the repository has fewer than
                # 21 items): there is nothing to average or plot.
                print("No results for %s" % label["values"])
                continue
            precision_20 = sum(precisions_at_20)/len(precisions_at_20)

            roc_points = results.get_roc_points()
            x_coord = [p[0] for p in roc_points]
            y_coord = [p[1] for p in roc_points]
            auc = numpy.trapz(y=y_coord, x=x_coord)
            # Extended AUC: add the areas under the straight segments joining
            # (0,0) to the first point and the last point to (1,1).
            eauc = (auc+
                    numpy.trapz(y=[0,roc_points[0][1]],x=[0,roc_points[0][0]])+
                    numpy.trapz(y=[roc_points[-1][1],1],x=[roc_points[-1][0],1]))
            coverage = len(results.recommended)/float(repo_size)
            with open(log_file,'w') as f:
                f.write("# %s\n# %s\n\n" %
                        (label["description"],label["values"]))
                f.write("# coverage \tp(20) \tauc \teauc\n\t%.2f \t%.2f \t%.4f \t%.4f\n\n" %
                        (coverage,precision_20,auc,eauc))
            plot_roc(roc_points,eauc,coverage,precision_20,log_file)
188   -
def run_content(user,cfg):
    """Run `run_strategy` for every content-based strategy and profile size."""
    for strategy_name in content_based:
        cfg.strategy = strategy_name
        for wanted_size in profile_size:
            cfg.profile_size = wanted_size
            run_strategy(cfg,user)
195   -
def run_collaborative(user,cfg):
    """Run `run_strategy` for every collaborative strategy and neighborhood size."""
    # The original read cfg.popcon_desktopapps and cfg.popcon_programs into
    # locals that were never used; those reads have been dropped.
    for strategy in collaborative:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            run_strategy(cfg,user)
204   -
def run_hybrid(user,cfg):
    """Run `run_strategy` for every hybrid strategy, neighborhood and profile size."""
    # The original read cfg.popcon_desktopapps and cfg.popcon_programs into
    # locals that were never used; those reads have been dropped.
    for strategy in hybrid:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            for size in profile_size:
                cfg.profile_size = size
                run_strategy(cfg,user)
215   -
if __name__ == '__main__':
    # Usage: roc-suite popcon_submission_path [content|collaborative|hybrid]
    # Without a family keyword all three families are run.
    if len(sys.argv)<2:
        print("Usage: roc-suite popcon_submission_path [content|collaborative|hybrid]")
        sys.exit(1)

    cfg = Config()
    user = PopconSystem(sys.argv[1])
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    # Only the arguments after the submission path select families, so a
    # script name or path that happens to equal a keyword is not mistaken
    # for one.
    requested = sys.argv[2:]
    run_all = not requested
    if run_all or "content" in requested:
        run_content(user,cfg)
    if run_all or "collaborative" in requested:
        run_collaborative(user,cfg)
    if run_all or "hybrid" in requested:
        run_hybrid(user,cfg)
src/experiments/strategies-suite.py
... ... @@ -1,274 +0,0 @@
1   -#!/usr/bin/env python
2   -"""
3   - recommender suite - recommender experiments suite
4   -"""
5   -__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
6   -__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
7   -__license__ = """
8   - This program is free software: you can redistribute it and/or modify
9   - it under the terms of the GNU General Public License as published by
10   - the Free Software Foundation, either version 3 of the License, or
11   - (at your option) any later version.
12   -
13   - This program is distributed in the hope that it will be useful,
14   - but WITHOUT ANY WARRANTY; without even the implied warranty of
15   - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16   - GNU General Public License for more details.
17   -
18   - You should have received a copy of the GNU General Public License
19   - along with this program. If not, see <http://www.gnu.org/licenses/>.
20   -"""
21   -
22   -import sys
23   -sys.path.insert(0,'../')
24   -from config import Config
25   -from data import PopconXapianIndex, PopconSubmission, AppAptXapianIndex
26   -from recommender import Recommender
27   -from user import LocalSystem, User
28   -from evaluation import *
29   -import logging
30   -import random
31   -import Gnuplot
32   -
# Experiment grid used by the run_* functions below.
#
# A smaller grid is kept here for reference (presumably for quick trial runs):
#   iterations = 3, sample_proportions = [0.9], weighting = [('bm25',1.2)],
#   collaborative = ['knn'], content_based = [], hybrid = ['knnco'],
#   profile_size = [50,100], popcon_size = ["1000"], neighbors = [50]

# Number of random profile splits per setup.
iterations = 10
# Fractions of the user profile moved into the held-out sample.
sample_proportions = [0.5, 0.6, 0.7, 0.8, 0.9]
# (weighting scheme, BM25 k1 parameter) pairs.
weighting = [
    ('bm25',1.2),
    ('bm25',1.6),
    ('bm25',2.0),
    ('trad',0),
]

# Strategy names, grouped by recommender family.
content_based = [
    'cb','cbt','cbd','cbh',
    'cb_eset','cbt_eset','cbd_eset','cbh_eset',
]
collaborative = ['knn_eset','knn','knn_plus']
hybrid = ['knnco','knnco_eset']

# Values tried for cfg.profile_size and cfg.k_neighbors.
# (A popcon_size grid, [1000,10000,50000,'full'], is currently disabled.)
profile_size = range(20,100,20)
neighbors = range(10,510,50)
53   -
def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
    """Write the recall log of iteration `n` to the file `<log_file>-<n>`.

    The file starts with two comment lines built from `label["description"]`
    and `label["values"]`, followed by the repository size, the profile size
    and the number of held-out packages. If `recommendation` has a `ranking`
    attribute, the 0-based rank of every package of `sample` found in the
    ranking is written in ascending order, followed by the packages that do
    not appear in the ranking at all.
    """
    # "with" closes the file even when one of the writes fails.
    with open("%s-%d" % (log_file,n),'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%d\n" % (label["values"],n))
        output.write("\n%d %d %d\n" %
                     (repo_size,profile_size,len(sample)))
        if hasattr(recommendation,"ranking"):
            # Index the ranking once (first occurrence wins, like list.index)
            # instead of scanning the whole list for every sampled package.
            position = {}
            for rank,pkg in enumerate(recommendation.ranking):
                position.setdefault(pkg,rank)
            ranks = sorted(position[pkg] for pkg in sample if pkg in position)
            notfound = [pkg for pkg in sample if pkg not in position]
            for r in ranks:
                output.write(str(r)+"\n")
            if notfound:
                output.write("Out of recommendation:\n")
                for pkg in notfound:
                    output.write(pkg+"\n")
76   -
def plot_summary(precision,recall,f1,f05,accuracy,log_file):
    """Plot the five metric summaries against the recommendation size.

    Each argument is passed to `Gnuplot.Data` as one curve. The chart is
    saved as `<log_file>.png` / `.ps` and, after switching the x axis to
    logarithmic scale, again as `<log_file>-logscale.png` / `.ps`.
    """
    setup_name = log_file.split("/")[-1]

    chart = Gnuplot.Gnuplot()
    chart('set style data lines')
    chart.xlabel('Recommendation size')
    chart.title("Setup: %s" % setup_name)

    curves = [
        Gnuplot.Data(accuracy,title="Accuracy"),
        Gnuplot.Data(precision,title="Precision"),
        Gnuplot.Data(recall,title="Recall"),
        Gnuplot.Data(f1,title="F_1"),
        Gnuplot.Data(f05,title="F_0.5"),
    ]
    chart.plot(*curves)
    chart.hardcopy(log_file+".png",terminal="png")
    chart.hardcopy(log_file+".ps",terminal="postscript",enhanced=1,color=1)

    # Same chart again with a logarithmic x axis.
    chart('set logscale x')
    chart('replot')
    logscale_base = log_file + "-logscale"
    chart.hardcopy(logscale_base+".png",terminal="png")
    chart.hardcopy(logscale_base+".ps",terminal="postscript",enhanced=1,color=1)
94   -
def get_label(cfg,sample_proportion):
    """Build the label that names the log files of the current setup.

    Returns a dict with two keys: "description" names the fields that make
    up the label and "values" holds the label itself. The label combines
    `cfg.strategy`, the strategy-specific parameters (`cfg.profile_size`
    and/or `cfg.k_neighbors`), the last path component of `cfg.pkgs_filter`,
    `cfg.bm25_k1` and `sample_proportion`.

    For a strategy that is in none of `content_based`, `collaborative` or
    `hybrid`, a message is printed and an empty dict is returned.
    """
    label = {}
    if cfg.strategy in content_based:
        label["description"] = "strategy-filter-profile-k1_bm25-sample"
        prefix = "%s-profile%d" % (cfg.strategy,cfg.profile_size)
    elif cfg.strategy in collaborative:
        label["description"] = "strategy-knn-filter-k1_bm25-sample"
        prefix = "%s-k%d" % (cfg.strategy,cfg.k_neighbors)
    elif cfg.strategy in hybrid:
        label["description"] = "strategy-knn-filter-profile-k1_bm25-sample"
        prefix = ("%s-k%d-profile%d" %
                  (cfg.strategy,cfg.k_neighbors,cfg.profile_size))
    else:
        # Call form with a single argument: same output on Python 2 and 3.
        print("Unknown strategy")
        return label
    # The filter / BM25 / sample suffix is the same for all three families.
    label["values"] = ("%s-%s-kbm%.1f-sample%.1f" %
                       (prefix,cfg.pkgs_filter.split("/")[-1],
                        cfg.bm25_k1,sample_proportion))
    return label
118   -
class ExperimentResults:
    """Collects five metrics at several recommendation sizes.

    The sizes are 1, the multiples of 10 below 200, and the multiples of 100
    from 200 up to (but excluding) `repo_size`. For each size the metrics of
    all evaluated rankings are stored in a list.
    """

    def __init__(self,repo_size):
        self.repository_size = repo_size
        self.accuracy = {}
        self.precision = {}
        self.recall = {}
        self.f1 = {}
        self.f05 = {}
        # list(...) is needed because range() is not a list on Python 3.
        points = ([1]+list(range(10,200,10))+
                  list(range(200,self.repository_size,100)))
        for size in points:
            self.accuracy[size] = []
            self.precision[size] = []
            self.recall[size] = []
            self.f1[size] = []
            self.f05[size] = []

    def add_result(self,ranking,sample):
        """Evaluate the top-`size` prefix of `ranking` for every size."""
        for size in self.accuracy:
            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repository_size)
            self.accuracy[size].append(evaluation.run(Accuracy()))
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            self.f1[size].append(evaluation.run(F_score(1)))
            self.f05[size].append(evaluation.run(F_score(0.5)))

    def _summary(self,metric):
        # Sorted [size, mean] pairs; sizes without values are left out so an
        # empty collector yields [] instead of dividing by zero.
        return sorted([size,sum(values)/len(values)]
                      for size,values in metric.items() if values)

    def _best(self,metric):
        # (size, value) of the highest single value recorded for the metric.
        # Raises ValueError when nothing has been recorded.
        size = max((s for s in metric if metric[s]),
                   key=lambda s: max(metric[s]))
        return (size,max(metric[size]))

    def get_precision_summary(self):
        """Return sorted [size, mean precision] pairs."""
        return self._summary(self.precision)

    def get_recall_summary(self):
        """Return sorted [size, mean recall] pairs."""
        return self._summary(self.recall)

    def get_f1_summary(self):
        """Return sorted [size, mean F1] pairs."""
        return self._summary(self.f1)

    def get_f05_summary(self):
        """Return sorted [size, mean F0.5] pairs."""
        return self._summary(self.f05)

    def get_accuracy_summary(self):
        """Return sorted [size, mean accuracy] pairs."""
        return self._summary(self.accuracy)

    def best_precision(self):
        """Return (size, value) of the best precision recorded."""
        return self._best(self.precision)

    def best_f1(self):
        """Return (size, value) of the best F1 recorded."""
        return self._best(self.f1)

    def best_f05(self):
        """Return (size, value) of the best F0.5 recorded."""
        return self._best(self.f05)
177   -
def run_strategy(cfg,user):
    """Run the metrics experiment for the strategy currently set in `cfg`.

    For every entry of `weighting` and every entry of `sample_proportions`
    the user's profile is split at random `iterations` times: `proportion`
    of the packages are held out as `sample`, the rest is handed to the
    recommender, and the full ranking is evaluated against `sample`.

    Output goes to `results/strategies/`: one recall log per iteration (see
    `write_recall_log`), one summary file named after the label, and the
    charts drawn by `plot_summary`.
    """
    # Imported here because the file's import block does not import it.
    import os

    output_dir = "results/strategies"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for weight in weighting:
        cfg.weight = weight[0]
        cfg.bm25_k1 = weight[1]
        rec = Recommender(cfg)
        repo_size = rec.items_repository.get_doccount()
        for proportion in sample_proportions:
            results = ExperimentResults(repo_size)
            label = get_label(cfg,proportion)
            log_file = "results/strategies/"+label["values"]
            # Named profile_len so it does not shadow the module-level
            # `profile_size` grid.
            profile_len = len(user.pkg_profile)
            sample_size = int(profile_len*proportion)
            for n in range(iterations):
                # Fill sample profile: move a random subset of the profile
                # into `sample`; what is left is the iteration user's profile.
                item_score = dict((pkg,user.item_score[pkg])
                                  for pkg in user.pkg_profile)
                held_out = random.sample(list(item_score),
                                         min(sample_size,len(item_score)))
                sample = dict((pkg,item_score.pop(pkg)) for pkg in held_out)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,repo_size)
                write_recall_log(label,n,sample,recommendation,profile_len,repo_size,log_file)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)

            if not results.precision[10]:
                # No ranking was evaluated, so there is nothing to summarize.
                print("No results for %s" % label["values"])
                continue
            precision_10 = sum(results.precision[10])/len(results.precision[10])
            f1_10 = sum(results.f1[10])/len(results.f1[10])
            f05_10 = sum(results.f05[10])/len(results.f05[10])
            # Each best_* scans every size, so compute them once.
            best_precision = results.best_precision()
            best_f1 = results.best_f1()
            best_f05 = results.best_f05()
            with open(log_file,'w') as f:
                # NOTE(review): "coverage" is taken from the recommendation of
                # the last iteration only - confirm that this is intended.
                f.write("# %s\n# %s\n\ncoverage %d\n\n" %
                        (label["description"],label["values"],recommendation.size))
                f.write("# best results (recommendation size; metric)\n")
                f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\nf05 (%d; %.2f)\n\n" %
                        (best_precision[0],best_precision[1],
                         best_f1[0],best_f1[1],
                         best_f05[0],best_f05[1]))
                f.write("# recommendation size 10\nprecision (10; %.2f)\nf1 (10; %.2f)\nf05 (10; %.2f)" %
                        (precision_10,f1_10,f05_10))
            precision = results.get_precision_summary()
            recall = results.get_recall_summary()
            f1 = results.get_f1_summary()
            f05 = results.get_f05_summary()
            accuracy = results.get_accuracy_summary()
            plot_summary(precision,recall,f1,f05,accuracy,log_file)
223   -
def run_content(user,cfg):
    """Run `run_strategy` for every content-based strategy and profile size."""
    for strategy_name in content_based:
        cfg.strategy = strategy_name
        for wanted_size in profile_size:
            cfg.profile_size = wanted_size
            run_strategy(cfg,user)
230   -
def run_collaborative(user,cfg):
    """Run `run_strategy` for every collaborative strategy and neighborhood size."""
    # NOTE: an extra loop over popcon_size (appending "_<size>" to
    # cfg.popcon_desktopapps and cfg.popcon_programs) used to sit here as
    # commented-out code; it and the two locals that only it used are gone.
    for strategy in collaborative:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            run_strategy(cfg,user)
243   -
def run_hybrid(user,cfg):
    """Run `run_strategy` for every hybrid strategy, neighborhood and profile size."""
    # NOTE: an extra loop over popcon_size (appending "_<size>" to
    # cfg.popcon_desktopapps and cfg.popcon_programs) used to sit here as
    # commented-out code; it and the two locals that only it used are gone.
    for strategy in hybrid:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            for size in profile_size:
                cfg.profile_size = size
                run_strategy(cfg,user)
258   -
if __name__ == '__main__':
    # The experiment runs on one fixed popcon submission. Alternatives tried
    # earlier: LocalSystem(),
    # RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
    # and the submission 4a/4a67a295ec14826db2aa1d90be2f1623.
    cfg = Config()
    user = PopconSystem("/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7")
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    # With no command line argument every family is run; otherwise only the
    # families whose keyword appears among the arguments.
    no_arguments = len(sys.argv)<2
    families = (
        ("content", run_content),
        ("collaborative", run_collaborative),
        ("hybrid", run_hybrid),
    )
    for keyword, runner in families:
        if keyword in sys.argv or no_arguments:
            runner(user,cfg)