Commit 2bc5af976c764e7ca2015f128e8c0ad2ac4ffb08

Authored by Tássia Camões Araújo
1 parent 4885b0b0
Exists in master and in 1 other branch add_vagrant

Updated experiments.

Showing 1 changed file with 225 additions and 97 deletions   Show diff stats
src/experiments/strategies-suite.py
... ... @@ -30,117 +30,245 @@ import logging
30 30 import random
31 31 import Gnuplot
32 32  
33   -def write_recall_log(label,sample,recommendation,log_file):
# Reduced configuration, kept commented out for reference:
#iterations = 3
#sample_proportions = [0.9]
#weighting = [('bm25',1.2)]
#collaborative = ['knn']
#content_based = []
#hybrid = ['knnco']
#profile_size = [50,100]
#popcon_size = ["1000"]
#neighbors = [50]

# Number of random profile splits evaluated for each configuration.
iterations = 10

# Fractions of the user profile moved into the held-out sample.
sample_proportions = [0.5, 0.6, 0.7, 0.8, 0.9]

# (weighting scheme, bm25 k1) pairs; run_strategy copies them to
# cfg.weight and cfg.bm25_k1 respectively.
weighting = [
    ('bm25', 1.2),
    ('bm25', 1.6),
    ('bm25', 2.0),
    ('trad', 0),
]

# Strategy names, grouped by family.
content_based = [
    'cb', 'cbt', 'cbd', 'cbh',
    'cb_eset', 'cbt_eset', 'cbd_eset', 'cbh_eset',
]
collaborative = ['knn_eset', 'knn', 'knn_plus']
hybrid = ['knnco', 'knnco_eset']

# Values swept for cfg.profile_size and cfg.k_neighbors.
profile_size = range(20, 100, 20)
#popcon_size = [1000,10000,50000,'full']
neighbors = range(10, 510, 50)
def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
    """Write the recall log of one iteration to the file "<log_file>-<n>".

    The log starts with two comment lines (the label description followed
    by a literal "-n", and the label values followed by the iteration
    number), then a blank line and a line holding repo_size, profile_size
    and the number of sampled packages.

    When `recommendation` has a `ranking` attribute, the ranking positions
    of the sampled packages follow in ascending order, one per line, and
    then an "Out of recommendation:" section listing the sampled packages
    that are absent from the ranking (omitted when there are none).

    label          -- dict with the keys "description" and "values"
    n              -- iteration number, also used as file name suffix
    sample         -- dict whose keys are the held-out packages
    recommendation -- object that may carry a `ranking` sequence
    profile_size   -- written verbatim to the header line
    repo_size      -- written verbatim to the header line
    log_file       -- path prefix of the log file
    """
    # The context manager closes the file even if a write fails.
    with open("%s-%d" % (log_file, n), 'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%d\n" % (label["values"], n))
        output.write("\n%d %d %d\n" % (repo_size, profile_size, len(sample)))
        if hasattr(recommendation, "ranking"):
            # Index the ranking once instead of scanning it twice for each
            # sampled package; setdefault keeps the first position of a
            # package, which is what list.index() reports.
            position = {}
            for index, pkg in enumerate(recommendation.ranking):
                position.setdefault(pkg, index)
            ranks = []
            notfound = []
            for pkg in sample:
                if pkg in position:
                    ranks.append(position[pkg])
                else:
                    notfound.append(pkg)
            for rank in sorted(ranks):
                output.write(str(rank)+"\n")
            if notfound:
                output.write("Out of recommendation:\n")
                for pkg in notfound:
                    output.write(pkg+"\n")
52 76  
53   -def plot_summary(sample,recommendation,repo_size,log_file):
def plot_summary(precision,recall,f1,f05,accuracy,log_file):
    """Plot the metric curves and save them next to the log file.

    Every metric argument is handed to Gnuplot.Data as one data series.
    The plot is saved as "<log_file>.png" and "<log_file>.ps" and, after
    switching the x axis to a logarithmic scale, as
    "<log_file>-logscale.png" and "<log_file>-logscale.ps".
    """
    def save(stem):
        # One PNG and one PostScript copy of the current plot
        plot.hardcopy(stem+".png",terminal="png")
        plot.hardcopy(stem+".ps",terminal="postscript",enhanced=1,color=1)

    plot = Gnuplot.Gnuplot()
    plot('set style data lines')
    plot.xlabel('Recommendation size')
    # The title shows only the last path component of the log file
    setup_name = log_file.split("/")[-1]
    plot.title("Setup: %s" % setup_name)
    curves = [
        Gnuplot.Data(accuracy,title="Accuracy"),
        Gnuplot.Data(precision,title="Precision"),
        Gnuplot.Data(recall,title="Recall"),
        Gnuplot.Data(f1,title="F_1"),
        Gnuplot.Data(f05,title="F_0.5"),
    ]
    plot.plot(*curves)
    save(log_file)
    plot('set logscale x')
    plot('replot')
    save(log_file+"-logscale")
  94 +
def get_label(cfg,sample_proportion):
    """Build the label dict identifying one experiment setup.

    The result has two keys: "description", a dash-separated list of field
    names, and "values", the matching dash-separated values taken from
    `cfg` and `sample_proportion`. The fields depend on whether
    cfg.strategy is listed in content_based, collaborative or hybrid
    (checked in that order). For any other strategy "Unknown strategy" is
    printed and an empty dict is returned.
    """
    strategy = cfg.strategy
    if strategy in content_based:
        description = "strategy-filter-profile-k1_bm25-sample"
        template = "%s-profile%d-%s-kbm%.1f-sample%.1f"
        head = (strategy,cfg.profile_size)
    elif strategy in collaborative:
        description = "strategy-knn-filter-k1_bm25-sample"
        template = "%s-k%d-%s-kbm%.1f-sample%.1f"
        head = (strategy,cfg.k_neighbors)
    elif strategy in hybrid:
        description = "strategy-knn-filter-profile-k1_bm25-sample"
        template = "%s-k%d-profile%d-%s-kbm%.1f-sample%.1f"
        head = (strategy,cfg.k_neighbors,cfg.profile_size)
    else:
        print("Unknown strategy")
        return {}
    # Every known family ends with the same three fields: the last path
    # component of the package filter, the bm25 k1 and the sample proportion.
    tail = (cfg.pkgs_filter.split("/")[-1],cfg.bm25_k1,sample_proportion)
    return {"description": description, "values": template % (head+tail)}
  118 +
class ExperimentResults:
    """Collect evaluation metrics per recommendation size over several runs.

    Each of `accuracy`, `precision`, `recall`, `f1` and `f05` maps a
    recommendation size to the list of values recorded for that size, one
    value per add_result() call. The sizes are 1, every multiple of 10
    below 200 and every multiple of 100 from 200 up to (excluding) the
    repository size.
    """

    def __init__(self,repo_size):
        """Create empty result lists for every evaluated recommendation size."""
        self.repository_size = repo_size
        self.accuracy = {}
        self.precision = {}
        self.recall = {}
        self.f1 = {}
        self.f05 = {}
        # list() keeps the concatenation valid when range() is not a list
        points = ([1] + list(range(10,200,10)) +
                  list(range(200,self.repository_size,100)))
        for size in points:
            self.accuracy[size] = []
            self.precision[size] = []
            self.recall[size] = []
            self.f1[size] = []
            self.f05[size] = []

    def add_result(self,ranking,sample):
        """Evaluate the top `size` items of `ranking` against `sample`.

        One value per metric is appended for every recommendation size.
        """
        for size in self.accuracy:
            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repository_size)
            self.accuracy[size].append(evaluation.run(Accuracy()))
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            self.f1[size].append(evaluation.run(F_score(1)))
            self.f05[size].append(evaluation.run(F_score(0.5)))

    def _summary(self,metric):
        """Return the sorted [size, mean value] pairs of one metric dict.

        Sizes without any recorded value are left out instead of causing a
        division by zero.
        """
        # float() so that integer-valued results are not truncated by
        # Python 2 integer division
        summary = [[size,sum(values)/float(len(values))]
                   for size,values in metric.items() if values]
        return sorted(summary)

    def _best(self,metric):
        """Return (size, value) for the highest single value of a metric.

        Raises ValueError when the metric holds no value at all.
        """
        populated = [size for size in metric if metric[size]]
        if not populated:
            raise ValueError("no results recorded")
        size = max(populated, key = lambda x: max(metric[x]))
        return (size,max(metric[size]))

    def get_precision_summary(self):
        """Return the sorted [size, mean precision] pairs."""
        return self._summary(self.precision)

    def get_recall_summary(self):
        """Return the sorted [size, mean recall] pairs."""
        return self._summary(self.recall)

    def get_f1_summary(self):
        """Return the sorted [size, mean F1] pairs."""
        return self._summary(self.f1)

    def get_f05_summary(self):
        """Return the sorted [size, mean F0.5] pairs."""
        return self._summary(self.f05)

    def get_accuracy_summary(self):
        """Return the sorted [size, mean accuracy] pairs."""
        return self._summary(self.accuracy)

    def best_precision(self):
        """Return (size, value) of the highest recorded precision."""
        return self._best(self.precision)

    def best_f1(self):
        """Return (size, value) of the highest recorded F1."""
        return self._best(self.f1)

    def best_f05(self):
        """Return (size, value) of the highest recorded F0.5."""
        return self._best(self.f05)
  177 +
def _split_profile(user,proportion):
    """Randomly move `proportion` of the user's profile into a sample.

    Returns (sample, remaining): two dicts mapping package to its score
    in user.item_score.
    """
    remaining = dict((pkg,user.item_score[pkg]) for pkg in user.pkg_profile)
    sample = {}
    for _ in range(int(len(user.pkg_profile)*proportion)):
        # list() gives random.choice a real sequence whatever keys() returns
        key = random.choice(list(remaining))
        sample[key] = remaining.pop(key)
    return sample,remaining

def _write_summary(label,results,coverage,log_file):
    """Write the summary of one experiment setup to `log_file`."""
    def mean(values):
        # float() avoids truncation by Python 2 integer division
        return sum(values)/float(len(values))

    # Compute everything before opening the file, so that a failure does
    # not leave a truncated summary behind.
    precision_10 = mean(results.precision[10])
    f1_10 = mean(results.f1[10])
    f05_10 = mean(results.f05[10])
    best_precision = results.best_precision()
    best_f1 = results.best_f1()
    best_f05 = results.best_f05()
    with open(log_file,'w') as f:
        f.write("# %s\n# %s\n\ncoverage %d\n\n" %
                (label["description"],label["values"],coverage))
        f.write("# best results (recommendation size; metric)\n")
        f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\nf05 (%d; %.2f)\n\n" %
                (best_precision[0],best_precision[1],
                 best_f1[0],best_f1[1],
                 best_f05[0],best_f05[1]))
        f.write("# recommendation size 10\nprecision (10; %.2f)\nf1 (10; %.2f)\nf05 (10; %.2f)" %
                (precision_10,f1_10,f05_10))

def run_strategy(cfg,user):
    """Run the strategy configured in `cfg` for every weighting and sample size.

    For each entry of `weighting` a Recommender is built; for each entry of
    `sample_proportions` the user profile is split `iterations` times into
    a random sample and a remaining profile. The remaining profile is
    recommended for, a recall log is written per iteration, and the
    metrics are accumulated in an ExperimentResults. Afterwards a summary
    file "results/strategies/<label values>" and the plots are produced.

    A setup in which no iteration yielded a ranking is reported with a
    warning and gets neither summary nor plot.
    """
    for weight in weighting:
        cfg.weight = weight[0]
        cfg.bm25_k1 = weight[1]
        rec = Recommender(cfg)
        repo_size = rec.items_repository.get_doccount()
        for proportion in sample_proportions:
            results = ExperimentResults(repo_size)
            label = get_label(cfg,proportion)
            log_file = "results/strategies/"+label["values"]
            profile_size = len(user.pkg_profile)
            recommendation = None
            for n in range(iterations):
                sample,remaining = _split_profile(user,proportion)
                iteration_user = User(remaining)
                recommendation = rec.get_recommendation(iteration_user,repo_size)
                write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
            if not results.precision[10]:
                # Nothing was recorded: averaging would divide by zero
                logging.warning("No ranking produced for %s; skipping summary and plot",
                                label["values"])
                continue
            # As before, coverage is the size of the last recommendation
            _write_summary(label,results,recommendation.size,log_file)
            plot_summary(results.get_precision_summary(),
                         results.get_recall_summary(),
                         results.get_f1_summary(),
                         results.get_f05_summary(),
                         results.get_accuracy_summary(),
                         log_file)
  223 +
def run_content(user,cfg):
    """Run the experiment for every content-based strategy and profile size.

    `cfg.strategy` and `cfg.profile_size` are overwritten in place before
    each run_strategy() call.
    """
    for strategy_name in content_based:
        cfg.strategy = strategy_name
        for wanted_size in profile_size:
            cfg.profile_size = wanted_size
            run_strategy(cfg,user)
  230 +
def run_collaborative(user,cfg):
    """Run the experiment for every collaborative strategy and neighborhood size.

    `cfg.strategy` and `cfg.k_neighbors` are overwritten in place before
    each run_strategy() call. The sweep over popcon sample sizes is
    disabled (popcon_size is commented out at module level), so the popcon
    paths of `cfg` are left untouched.
    """
    for strategy in collaborative:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            run_strategy(cfg,user)
  243 +
def run_hybrid(user,cfg):
    """Run the experiment for every hybrid strategy, neighborhood and profile size.

    `cfg.strategy`, `cfg.k_neighbors` and `cfg.profile_size` are
    overwritten in place before each run_strategy() call. The sweep over
    popcon sample sizes is disabled (popcon_size is commented out at module
    level), so the popcon paths of `cfg` are left untouched.
    """
    for strategy in hybrid:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            for size in profile_size:
                cfg.profile_size = size
                run_strategy(cfg,user)
116 258  
if __name__ == '__main__':
    # Alternative user sources, kept for reference:
    #user = LocalSystem()
    #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))

    cfg = Config()
    popcon_entry = "/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7"
    #popcon_entry = "/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623"
    user = PopconSystem(popcon_entry)
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    # Without command-line arguments every strategy family runs; otherwise
    # only the families named on the command line.
    runners = (
        ("content", run_content),
        ("collaborative", run_collaborative),
        ("hybrid", run_hybrid),
    )
    for family, runner in runners:
        if len(sys.argv)<2 or family in sys.argv:
            runner(user,cfg)
... ...