Commit b33c0cb1890a68c50ca6511c5d63fc2ffebfa854

Authored by Tássia Camões Araújo
1 parent ccd4ef55
Exists in master and in 1 other branch add_vagrant

Update metrics experiments.

Showing 2 changed files with 106 additions and 101 deletions   Show diff stats
src/experiments/hybrid.py
... ... @@ -31,6 +31,8 @@ import random
31 31 import Gnuplot
32 32 import numpy
33 33  
  34 +#hybrid_strategies = ['knnco','knnco_eset']
  35 +
34 36 if __name__ == '__main__':
35 37 if len(sys.argv)<2:
36 38 print "Usage: hybrid strategy sample_file"
... ... @@ -38,9 +40,7 @@ if __name__ == '__main__':
38 40  
39 41 iterations = 20
40 42 profile_size = [10,40,70,100,170,240]
41   - neighbor_size = [3,10,50,100,200,400]
42   -
43   - #hybrid_strategies = ['knnco','knnco_eset']
  43 + neighbor_size = [3,10,50,70,100,150,200]
44 44  
45 45 #iterations = 1
46 46 #profile_size = [10,20,30]
... ... @@ -55,55 +55,55 @@ if __name__ == '__main__':
55 55 for line in f.readlines():
56 56 user_id = line.strip('\n')
57 57 population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
58   - sample_dir = ("results/hybrid/%s" % sample_str)
  58 + sample_dir = ("results/hybrid/%s/%s" % (sample_str,strategy))
59 59 if not os.path.exists(sample_dir):
60 60 os.makedirs(sample_dir)
61 61  
62 62 cfg.strategy = strategy
63   - p_20_summary = {}
  63 + p_10_summary = {}
64 64 f05_100_summary = {}
65   - c_20 = {}
  65 + c_10 = {}
66 66 c_100 = {}
67 67  
68 68 log_file = os.path.join(sample_dir,sample_str+"-"+cfg.strategy)
69   - graph_20 = {}
  69 + graph_10 = {}
70 70 graph_100 = {}
71   - graph_20_jpg = {}
  71 + graph_10_jpg = {}
72 72 graph_100_jpg = {}
73   - comment_20 = {}
  73 + comment_10 = {}
74 74 comment_100 = {}
75 75 for k in neighbor_size:
76   - graph_20[k] = log_file+("-neighboorhod%.3d-020.png"%k)
77   - graph_100[k] = log_file+("-neighboorhod%.3d-100.png"%k)
78   - graph_20_jpg[k] = graph_20[k].strip(".png")+".jpg"
  76 + graph_10[k] = log_file+("-neighborhood%.3d-010.png"%k)
  77 + graph_100[k] = log_file+("-neighborhood%.3d-100.png"%k)
  78 + graph_10_jpg[k] = graph_10[k].strip(".png")+".jpg"
79 79 graph_100_jpg[k] = graph_100[k].strip(".png")+".jpg"
80   - comment_20[k] = graph_20_jpg[k]+".comment"
  80 + comment_10[k] = graph_10_jpg[k]+".comment"
81 81 comment_100[k] = graph_100_jpg[k]+".comment"
82 82  
83   - with open(comment_20[k],'w') as f:
  83 + with open(comment_10[k],'w') as f:
84 84 f.write("# %s\n" % sample_str)
85   - f.write("# strategy %s\n# threshold 20\n# iterations %d\n\n" %
  85 + f.write("# strategy %s\n# threshold 10\n# iterations %d\n\n" %
86 86 (cfg.strategy,iterations))
87   - f.write("# neighboorhood\tprofile\tp_20\tc_20\n\n")
  87 + f.write("# neighborhood\tprofile\tmean_p_10\tdev_p_10\tc_10\n\n")
88 88 with open(comment_100[k],'w') as f:
89 89 f.write("# %s\n" % sample_str)
90 90 f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
91 91 (cfg.strategy,iterations))
92   - f.write("# neighboorhood\tprofile\tf05_100\tc_100\n\n")
  92 + f.write("# neighborhood\tprofile\tmean_f05_100\tdev_f05_100\tc_100\n\n")
93 93  
94   - c_20[k] = {}
  94 + c_10[k] = {}
95 95 c_100[k] = {}
96   - p_20_summary[k] = {}
  96 + p_10_summary[k] = {}
97 97 f05_100_summary[k] = {}
98 98 for size in profile_size:
99   - c_20[k][size] = set()
  99 + c_10[k][size] = set()
100 100 c_100[k][size] = set()
101   - p_20_summary[k][size] = []
  101 + p_10_summary[k][size] = []
102 102 f05_100_summary[k][size] = []
103   - with open(log_file+"-neighboorhood%.3d-profile%.3d"%(k,size),'w') as f:
  103 + with open(log_file+"-neighborhood%.3d-profile%.3d"%(k,size),'w') as f:
104 104 f.write("# %s\n" % sample_str)
105   - f.write("# strategy %s-neighboorhood%.3d-profile%.3d\n\n" % (cfg.strategy,k,size))
106   - f.write("# p_20\t\tf05_100\n\n")
  105 + f.write("# strategy %s-neighborhood%.3d-profile%.3d\n\n" % (cfg.strategy,k,size))
  106 + f.write("# p_10\t\tf05_100\n\n")
107 107  
108 108 # main loop per user
109 109 for submission_file in population_sample:
... ... @@ -116,7 +116,7 @@ if __name__ == '__main__':
116 116 cfg.profile_size = size
117 117 rec = Recommender(cfg)
118 118 repo_size = rec.items_repository.get_doccount()
119   - p_20 = []
  119 + p_10 = []
120 120 f05_100 = []
121 121 for n in range(iterations):
122 122 # Fill sample profile
... ... @@ -134,40 +134,42 @@ if __name__ == '__main__':
134 134 if hasattr(recommendation,"ranking"):
135 135 ranking = recommendation.ranking
136 136 real = RecommendationResult(sample)
137   - predicted_20 = RecommendationResult(dict.fromkeys(ranking[:20],1))
138   - evaluation = Evaluation(predicted_20,real,repo_size)
139   - p_20.append(evaluation.run(Precision()))
  137 + predicted_10 = RecommendationResult(dict.fromkeys(ranking[:10],1))
  138 + evaluation = Evaluation(predicted_10,real,repo_size)
  139 + p_10.append(evaluation.run(Precision()))
140 140 predicted_100 = RecommendationResult(dict.fromkeys(ranking[:100],1))
141 141 evaluation = Evaluation(predicted_100,real,repo_size)
142 142 f05_100.append(evaluation.run(F_score(0.5)))
143   - c_20[k][size] = c_20[k][size].union(recommendation.ranking[:20])
  143 + c_10[k][size] = c_10[k][size].union(recommendation.ranking[:10])
144 144 c_100[k][size] = c_100[k][size].union(recommendation.ranking[:100])
145 145 # save summary
146   - if p_20:
147   - p_20_summary[k][size].append(sum(p_20)/len(p_20))
  146 + if p_10:
  147 + p_10_summary[k][size].append(numpy.mean(p_10))
148 148 if f05_100:
149   - f05_100_summary[k][size].append(sum(f05_100)/len(f05_100))
  149 + f05_100_summary[k][size].append(numpy.mean(f05_100))
150 150  
151   - with open(log_file+"-neighboorhood%.3d-profile%.3d"%(k,size),'a') as f:
  151 + with open(log_file+"-neighborhood%.3d-profile%.3d"%(k,size),'a') as f:
152 152 f.write("%.4f\t\t%.4f\n" %
153   - ((sum(p_20)/len(p_20),sum(f05_100)/len(f05_100))))
  153 + (numpy.mean(p_10),numpy.mean(f05_100)))
154 154  
155 155 # back to main flow
156   - coverage_20 = {}
  156 + coverage_10 = {}
157 157 coverage_100 = {}
158 158 for k in neighbor_size:
159   - coverage_20[k] = {}
  159 + coverage_10[k] = {}
160 160 coverage_100[k] = {}
161   - with open(comment_20[k],'a') as f:
  161 + with open(comment_10[k],'a') as f:
162 162 for size in profile_size:
163   - coverage_20[k][size] = len(c_20[k][size])/float(repo_size)
164   - f.write("%3d\t\t%3d\t\t%.4f\t%.4f\n" %
165   - (k,size,float(sum(p_20_summary[k][size]))/len(p_20_summary[k][size]),coverage_20[k][size]))
  163 + coverage_10[k][size] = len(c_10[k][size])/float(repo_size)
  164 + f.write("%3d\t\t%3d\t\t%.4f\t%.4f\t%.4f\n" %
  165 + (k,size,numpy.mean(p_10_summary[k][size]),
  166 + numpy.std(p_10_summary[k][size]),coverage_10[k][size]))
166 167 with open(comment_100[k],'a') as f:
167 168 for size in profile_size:
168 169 coverage_100[k][size] = len(c_100[k][size])/float(repo_size)
169   - f.write("%3d\t\t%3d\t\t%.4f\t%.4f\n" %
170   - (k,size,float(sum(f05_100_summary[k][size]))/len(f05_100_summary[k][size]),coverage_100[k][size]))
  170 + f.write("%3d\t\t%3d\t\t%.4f\t%.4f\t%.4f\n" %
  171 + (k,size,numpy.mean(f05_100_summary[k][size]),
  172 + numpy.std(f05_100_summary[k][size]),coverage_100[k][size]))
171 173  
172 174 for k in neighbor_size:
173 175 # plot results summary
... ... @@ -175,23 +177,26 @@ if __name__ == '__main__':
175 177 g('set style data lines')
176 178 g('set yrange [0:1.0]')
177 179 g.xlabel('Profile size')
178   - g.title("Setup: %s-neighboorhood%3d (threshold 20)" % (cfg.strategy,k))
179   - g.plot(Gnuplot.Data(sorted([[i,sum(p_20_summary[k][i])/len(p_20_summary[k][i])]
180   - for i in p_20_summary[k].keys()]),title="Precision"),
181   - Gnuplot.Data(sorted([[i,coverage_20[k][i]]
182   - for i in coverage_20[k].keys()]),title="Coverage"))
183   - g.hardcopy(graph_20[k],terminal="png")
184   - #commands.getoutput("convert -quality 100 %s %s" %
185   - # (graph_20[k],graph_20_jpg[k]))
  180 + g.title("Setup: %s-neighborhood%3d (threshold 10)" % (cfg.strategy,k))
  181 + g.plot(Gnuplot.Data(sorted([[i,numpy.mean(p_10_summary[k][i]),numpy.std(p_10_summary[k][i])]
  182 + for i in p_10_summary[k].keys()]),title="Precision"),
  183 + Gnuplot.Data(sorted([[i,numpy.mean(p_10_summary[k][i]),numpy.std(p_10_summary[k][i])]
  184 + for i in p_10_summary[k].keys()]),title="Deviation",
  185 + with_="yerrorbar lt 2 pt 6"),
  186 + Gnuplot.Data(sorted([[i,coverage_10[k][i]]
  187 + for i in coverage_10[k].keys()]),title="Coverage"))
  188 + g.hardcopy(graph_10[k],terminal="png")
  189 +
186 190 g = Gnuplot.Gnuplot()
187 191 g('set style data lines')
188 192 g('set yrange [0:1.0]')
189 193 g.xlabel('Profile size')
190   - g.title("Setup: %s-neighboorhood%3d (threshold 100)" % (cfg.strategy,k))
191   - g.plot(Gnuplot.Data(sorted([[i,sum(f05_100_summary[k][i])/len(f05_100_summary[k][i])]
  194 + g.title("Setup: %s-neighborhood%3d (threshold 100)" % (cfg.strategy,k))
  195 + g.plot(Gnuplot.Data(sorted([[i,numpy.mean(f05_100_summary[k][i]),numpy.std(f05_100_summary[k][i])]
192 196 for i in f05_100_summary[k].keys()]),title="F05"),
  197 + Gnuplot.Data(sorted([[i,numpy.mean(f05_100_summary[k][i]),numpy.std(f05_100_summary[k][i])]
  198 + for i in f05_100_summary[k].keys()]),title="Deviation",
  199 + with_="yerrorbar lt 2 pt 6"),
193 200 Gnuplot.Data(sorted([[i,coverage_100[k][i]]
194 201 for i in coverage_100[k].keys()]),title="Coverage"))
195 202 g.hardcopy(graph_100[k],terminal="png")
196   - #commands.getoutput("convert -quality 100 %s %s" %
197   - # (graph_100[k],graph_100_jpg[k]))
... ...
src/experiments/pure.py
... ... @@ -33,22 +33,21 @@ import numpy
33 33  
34 34 if __name__ == '__main__':
35 35 if len(sys.argv)<2:
36   - print "Usage: profile-suite strategy_category sample_file"
  36 + print "Usage: pure strategy_category sample_file"
37 37 exit(1)
38 38  
39 39 iterations = 20
40   - profile_size = [10,20,40,70,100,140,170,200,240]
41   - neighbor_size = [3,5,10,50,100,150,200,300,400,500]
  40 + profile_size = [10,20,40,60,80,100,140,170,200,240]
  41 + neighbor_size = [3,5,10,20,30,50,70,100,150,200]
42 42  
43 43 content_strategies = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
44   - collaborative_strategies = ['knn_eset']#,'knn_eset','knn_plus']
45   - #collaborative_strategies = ['knn','knn_eset','knn_plus']
  44 + collaborative_strategies = ['knn_eset','knn','knn_plus']
46 45  
47 46 #iterations = 1
48 47 #profile_size = [10,20,30]
49   - #neighbor_size = [10,20,30]
  48 + #neighbor_size = [3,5,10,20,30,50]
50 49 #content_strategies = ['cb']
51   - #collaborative_strategies = ['knn_eset']
  50 + #collaborative_strategies = ['knn']
52 51  
53 52 strategy_category = sys.argv[1]
54 53 if strategy_category == "content":
... ... @@ -78,39 +77,39 @@ if __name__ == '__main__':
78 77  
79 78 for strategy in strategies:
80 79 cfg.strategy = strategy
81   - p_20_summary = {}
  80 + p_10_summary = {}
82 81 f05_100_summary = {}
83   - c_20 = {}
  82 + c_10 = {}
84 83 c_100 = {}
85 84  
86 85 log_file = os.path.join(sample_dir,sample_str+"-"+cfg.strategy)
87   - graph_20 = log_file+"-20.png"
  86 + graph_10 = log_file+"-10.png"
88 87 graph_100 = log_file+"-100.png"
89   - graph_20_jpg = graph_20.strip(".png")+".jpg"
  88 + graph_10_jpg = graph_10.strip(".png")+".jpg"
90 89 graph_100_jpg = graph_100.strip(".png")+".jpg"
91   - comment_20 = graph_20_jpg+".comment"
  90 + comment_10 = graph_10_jpg+".comment"
92 91 comment_100 = graph_100_jpg+".comment"
93 92  
94   - with open(comment_20,'w') as f:
  93 + with open(comment_10,'w') as f:
95 94 f.write("# sample %s\n" % sample_str)
96   - f.write("# strategy %s\n# threshold 20\n# iterations %d\n\n" %
  95 + f.write("# strategy %s\n# threshold 10\n# iterations %d\n\n" %
97 96 (cfg.strategy,iterations))
98   - f.write("# %s\tp_20\tc_20\n\n"%option_str)
  97 + f.write("# %s\tmean_p_10\tdev_p_10\tc_10\n\n"%option_str)
99 98 with open(comment_100,'w') as f:
100 99 f.write("# sample %s\n" % sample_str)
101 100 f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
102 101 (cfg.strategy,iterations))
103   - f.write("# %s\t\tf05_100\t\tc_100\n\n"%option_str)
  102 + f.write("# %s\t\tmean_f05_100\t\tdev_f05_100\t\tc_100\n\n"%option_str)
104 103  
105 104 for size in sizes:
106   - c_20[size] = set()
  105 + c_10[size] = set()
107 106 c_100[size] = set()
108   - p_20_summary[size] = []
  107 + p_10_summary[size] = []
109 108 f05_100_summary[size] = []
110 109 with open(log_file+"-%s%.3d"%(option_str,size),'w') as f:
111 110 f.write("# sample %s\n" % sample_str)
112 111 f.write("# strategy %s-%s%.3d\n\n" % (cfg.strategy,option_str,size))
113   - f.write("# p_20\tf05_100\n\n")
  112 + f.write("# p_10\tf05_100\n\n")
114 113  
115 114 # main loop per user
116 115 for submission_file in population_sample:
... ... @@ -122,7 +121,7 @@ if __name__ == '__main__':
122 121 cfg.k_neighbors = size
123 122 rec = Recommender(cfg)
124 123 repo_size = rec.items_repository.get_doccount()
125   - p_20 = []
  124 + p_10 = []
126 125 f05_100 = []
127 126 for n in range(iterations):
128 127 # Fill sample profile
... ... @@ -140,60 +139,61 @@ if __name__ == '__main__':
140 139 if hasattr(recommendation,"ranking"):
141 140 ranking = recommendation.ranking
142 141 real = RecommendationResult(sample)
143   - predicted_20 = RecommendationResult(dict.fromkeys(ranking[:20],1))
144   - evaluation = Evaluation(predicted_20,real,repo_size)
145   - p_20.append(evaluation.run(Precision()))
  142 + predicted_10 = RecommendationResult(dict.fromkeys(ranking[:10],1))
  143 + evaluation = Evaluation(predicted_10,real,repo_size)
  144 + p_10.append(evaluation.run(Precision()))
146 145 predicted_100 = RecommendationResult(dict.fromkeys(ranking[:100],1))
147 146 evaluation = Evaluation(predicted_100,real,repo_size)
148 147 f05_100.append(evaluation.run(F_score(0.5)))
149   - c_20[size] = c_20[size].union(recommendation.ranking[:20])
  148 + c_10[size] = c_10[size].union(recommendation.ranking[:10])
150 149 c_100[size] = c_100[size].union(recommendation.ranking[:100])
151 150 # save summary
152   - if p_20:
153   - p_20_summary[size].append(sum(p_20)/len(p_20))
  151 + if p_10:
  152 + p_10_summary[size].append(numpy.mean(p_10))
154 153 if f05_100:
155   - f05_100_summary[size].append(sum(f05_100)/len(f05_100))
  154 + f05_100_summary[size].append(numpy.mean(f05_100))
156 155  
157 156 with open(log_file+"-%s%.3d"%(option_str,size),'a') as f:
158   - f.write("%.4f \t%.4f\n" %
159   - ((sum(p_20)/len(p_20),sum(f05_100)/len(f05_100))))
  157 + f.write("%.4f \t%.4f\n" % (numpy.mean(p_10),numpy.mean(f05_100)))
160 158  
161 159 # back to main flow
162   - coverage_20 = {}
  160 + coverage_10 = {}
163 161 coverage_100 = {}
164   - with open(comment_20,'a') as f:
  162 + with open(comment_10,'a') as f:
165 163 for size in sizes:
166   - coverage_20[size] = len(c_20[size])/float(repo_size)
167   - f.write("%3d\t\t%.4f\t\t%.4f\n" %
168   - (size,float(sum(p_20_summary[size]))/len(p_20_summary[size]),coverage_20[size]))
  164 + coverage_10[size] = len(c_10[size])/float(repo_size)
  165 + f.write("%3d\t\t%.4f\t\t%.4f\t\t%.4f\n" %
  166 + (size,numpy.mean(p_10_summary[size]),numpy.std(p_10_summary[size]),coverage_10[size]))
169 167 with open(comment_100,'a') as f:
170 168 for size in sizes:
171 169 coverage_100[size] = len(c_100[size])/float(repo_size)
172   - f.write("%3d\t\t%.4f\t\t%.4f\n" %
173   - (size,float(sum(f05_100_summary[size]))/len(f05_100_summary[size]),coverage_100[size]))
  170 + f.write("%3d\t\t%.4f\t\t%.4f\t\t%.4f\n" %
  171 + (size,numpy.mean(f05_100_summary[size]),numpy.std(f05_100_summary[size]),coverage_100[size]))
174 172  
175 173 # plot results summary
176 174 g = Gnuplot.Gnuplot()
177 175 g('set style data lines')
178 176 g('set yrange [0:1.0]')
179 177 g.xlabel('%s size'%option_str.capitalize())
180   - g.title("Setup: %s (threshold 20)" % cfg.strategy)
181   - g.plot(Gnuplot.Data(sorted([[k,sum(p_20_summary[k])/len(p_20_summary[k])]
182   - for k in p_20_summary.keys()]),title="Precision"),
183   - Gnuplot.Data(sorted([[k,coverage_20[k]]
184   - for k in coverage_20.keys()]),title="Coverage"))
185   - g.hardcopy(graph_20,terminal="png")
186   - commands.getoutput("convert -quality 20 %s %s" %
187   - (graph_100,graph_20_jpg))
  178 + g.title("Setup: %s (threshold 10)" % cfg.strategy)
  179 + g.plot(Gnuplot.Data(sorted([[k,numpy.mean(p_10_summary[k]),numpy.std(p_10_summary[k])]
  180 + for k in p_10_summary.keys()]),title="Precision"),
  181 + Gnuplot.Data(sorted([[k,numpy.mean(p_10_summary[k]),numpy.std(p_10_summary[k])]
  182 + for k in p_10_summary.keys()]),title="Deviation",
  183 + with_="yerrorbar lt 2 pt 6"),
  184 + Gnuplot.Data(sorted([[k,coverage_10[k]]
  185 + for k in coverage_10.keys()]),title="Coverage"))
  186 + g.hardcopy(graph_10,terminal="png")
188 187 g = Gnuplot.Gnuplot()
189 188 g('set style data lines')
190 189 g('set yrange [0:1.0]')
191 190 g.xlabel('%s size'%option_str.capitalize())
192 191 g.title("Setup: %s (threshold 100)" % cfg.strategy)
193   - g.plot(Gnuplot.Data(sorted([[k,sum(f05_100_summary[k])/len(f05_100_summary[k])]
  192 + g.plot(Gnuplot.Data(sorted([[k,numpy.mean(f05_100_summary[k]),numpy.std(f05_100_summary[k])]
194 193 for k in f05_100_summary.keys()]),title="F05"),
  194 + Gnuplot.Data(sorted([[k,numpy.mean(f05_100_summary[k]),numpy.std(f05_100_summary[k])]
  195 + for k in f05_100_summary.keys()]),title="Deviation",
  196 + with_="yerrorbar lt 2 pt 6"),
195 197 Gnuplot.Data(sorted([[k,coverage_100[k]]
196 198 for k in coverage_100.keys()]),title="Coverage"))
197 199 g.hardcopy(graph_100,terminal="png")
198   - commands.getoutput("convert -quality 100 %s %s" %
199   - (graph_100,graph_100_jpg))
... ...