Up-to-date metrics experiments.

Tássia Camões Araújo
1 parent ccd4ef55
Showing 2 changed files with 106 additions and 101 deletions Show diff stats
src/experiments/hybrid.py
src/experiments/pure.py
@@ -31,6 +31,8 @@ import random
 import Gnuplot
 import numpy
  
+#hybrid_strategies = ['knnco','knnco_eset']
+
 if __name__ == '__main__':
     if len(sys.argv)<2:
         print "Usage: hybrid strategy sample_file"
@@ -38,9 +40,7 @@ if __name__ == '__main__':
  
     iterations = 20
     profile_size = [10,40,70,100,170,240]
-    neighbor_size = [3,10,50,100,200,400]
-
-    #hybrid_strategies = ['knnco','knnco_eset']
+    neighbor_size = [3,10,50,70,100,150,200]
  
     #iterations = 1
     #profile_size = [10,20,30]
@@ -55,55 +55,55 @@ if __name__ == '__main__':
         for line in f.readlines():
             user_id = line.strip('\n')
             population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
-    sample_dir = ("results/hybrid/%s" % sample_str)
+    sample_dir = ("results/hybrid/%s/%s" % (sample_str,strategy))
     if not os.path.exists(sample_dir):
         os.makedirs(sample_dir)
  
     cfg.strategy = strategy
-    p_20_summary = {}
+    p_10_summary = {}
     f05_100_summary = {}
-    c_20 = {}
+    c_10 = {}
     c_100 = {}
  
     log_file = os.path.join(sample_dir,sample_str+"-"+cfg.strategy)
-    graph_20 = {}
+    graph_10 = {}
     graph_100 = {}
-    graph_20_jpg = {}
+    graph_10_jpg = {}
     graph_100_jpg = {}
-    comment_20 = {}
+    comment_10 = {}
     comment_100 = {}
     for k in neighbor_size:
-        graph_20[k] = log_file+("-neighboorhod%.3d-020.png"%k)
-        graph_100[k] = log_file+("-neighboorhod%.3d-100.png"%k)
-        graph_20_jpg[k] = graph_20[k].strip(".png")+".jpg"
+        graph_10[k] = log_file+("-neighborhood%.3d-010.png"%k)
+        graph_100[k] = log_file+("-neighborhood%.3d-100.png"%k)
+        graph_10_jpg[k] = graph_10[k].strip(".png")+".jpg"
         graph_100_jpg[k] = graph_100[k].strip(".png")+".jpg"
-        comment_20[k] = graph_20_jpg[k]+".comment"
+        comment_10[k] = graph_10_jpg[k]+".comment"
         comment_100[k] = graph_100_jpg[k]+".comment"
  
-        with open(comment_20[k],'w') as f:
+        with open(comment_10[k],'w') as f:
             f.write("# %s\n" % sample_str)
-            f.write("# strategy %s\n# threshold 20\n# iterations %d\n\n" %
+            f.write("# strategy %s\n# threshold 10\n# iterations %d\n\n" %
                     (cfg.strategy,iterations))
-            f.write("# neighboorhood\tprofile\tp_20\tc_20\n\n")
+            f.write("# neighborhood\tprofile\tmean_p_10\tdev_p_10\tc_10\n\n")
         with open(comment_100[k],'w') as f:
             f.write("# %s\n" % sample_str)
             f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
                     (cfg.strategy,iterations))
-            f.write("# neighboorhood\tprofile\tf05_100\tc_100\n\n")
+            f.write("# neighborhood\tprofile\tmean_f05_100\tdev_f05_100\tc_100\n\n")
  
-        c_20[k] = {}
+        c_10[k] = {}
         c_100[k] = {}
-        p_20_summary[k] = {}
+        p_10_summary[k] = {}
         f05_100_summary[k] = {}
         for size in profile_size:
-            c_20[k][size] = set()
+            c_10[k][size] = set()
             c_100[k][size] = set()
-            p_20_summary[k][size] = []
+            p_10_summary[k][size] = []
             f05_100_summary[k][size] = []
-            with open(log_file+"-neighboorhood%.3d-profile%.3d"%(k,size),'w') as f:
+            with open(log_file+"-neighborhood%.3d-profile%.3d"%(k,size),'w') as f:
                 f.write("# %s\n" % sample_str)
-                f.write("# strategy %s-neighboorhood%.3d-profile%.3d\n\n" % (cfg.strategy,k,size))
-                f.write("# p_20\t\tf05_100\n\n")
+                f.write("# strategy %s-neighborhood%.3d-profile%.3d\n\n" % (cfg.strategy,k,size))
+                f.write("# p_10\t\tf05_100\n\n")
  
     # main loop per user
     for submission_file in population_sample:
@@ -116,7 +116,7 @@ if __name__ == '__main__':
                 cfg.profile_size = size
                 rec = Recommender(cfg)
                 repo_size = rec.items_repository.get_doccount()
-                p_20 = []
+                p_10 = []
                 f05_100 = []
                 for n in range(iterations):
                     # Fill sample profile
@@ -134,40 +134,42 @@ if __name__ == '__main__':
                     if hasattr(recommendation,"ranking"):
                         ranking = recommendation.ranking
                         real = RecommendationResult(sample)
-                        predicted_20 = RecommendationResult(dict.fromkeys(ranking[:20],1))
-                        evaluation = Evaluation(predicted_20,real,repo_size)
-                        p_20.append(evaluation.run(Precision()))
+                        predicted_10 = RecommendationResult(dict.fromkeys(ranking[:10],1))
+                        evaluation = Evaluation(predicted_10,real,repo_size)
+                        p_10.append(evaluation.run(Precision()))
                         predicted_100 = RecommendationResult(dict.fromkeys(ranking[:100],1))
                         evaluation = Evaluation(predicted_100,real,repo_size)
                         f05_100.append(evaluation.run(F_score(0.5)))
-                        c_20[k][size] = c_20[k][size].union(recommendation.ranking[:20])
+                        c_10[k][size] = c_10[k][size].union(recommendation.ranking[:10])
                         c_100[k][size] = c_100[k][size].union(recommendation.ranking[:100])
                 # save summary
-                if p_20:
-                    p_20_summary[k][size].append(sum(p_20)/len(p_20))
+                if p_10:
+                    p_10_summary[k][size].append(numpy.mean(p_10))
                 if f05_100:
-                    f05_100_summary[k][size].append(sum(f05_100)/len(f05_100))
+                    f05_100_summary[k][size].append(numpy.mean(f05_100))
  
-                with open(log_file+"-neighboorhood%.3d-profile%.3d"%(k,size),'a') as f:
+                with open(log_file+"-neighborhood%.3d-profile%.3d"%(k,size),'a') as f:
                     f.write("%.4f\t\t%.4f\n" %
-                            ((sum(p_20)/len(p_20),sum(f05_100)/len(f05_100))))
+                            (numpy.mean(p_10),numpy.mean(f05_100)))
  
     # back to main flow
-    coverage_20 = {}
+    coverage_10 = {}
     coverage_100 = {}
     for k in neighbor_size:
-        coverage_20[k] = {}
+        coverage_10[k] = {}
         coverage_100[k] = {}
-        with open(comment_20[k],'a') as f:
+        with open(comment_10[k],'a') as f:
             for size in profile_size:
-                coverage_20[k][size] = len(c_20[k][size])/float(repo_size)
-                f.write("%3d\t\t%3d\t\t%.4f\t%.4f\n" %
-                        (k,size,float(sum(p_20_summary[k][size]))/len(p_20_summary[k][size]),coverage_20[k][size]))
+                coverage_10[k][size] = len(c_10[k][size])/float(repo_size)
+                f.write("%3d\t\t%3d\t\t%.4f\t%.4f\t%.4f\n" %
+                        (k,size,numpy.mean(p_10_summary[k][size]),
+                         numpy.std(p_10_summary[k][size]),coverage_10[k][size]))
         with open(comment_100[k],'a') as f:
             for size in profile_size:
                 coverage_100[k][size] = len(c_100[k][size])/float(repo_size)
-                f.write("%3d\t\t%3d\t\t%.4f\t%.4f\n" %
-                        (k,size,float(sum(f05_100_summary[k][size]))/len(f05_100_summary[k][size]),coverage_100[k][size]))
+                f.write("%3d\t\t%3d\t\t%.4f\t%.4f\t%.4f\n" %
+                        (k,size,numpy.mean(f05_100_summary[k][size]),
+                         numpy.std(f05_100_summary[k][size]),coverage_100[k][size]))
  
     for k in neighbor_size:
         # plot results summary
@@ -175,23 +177,26 @@ if __name__ == '__main__':
         g('set style data lines')
         g('set yrange [0:1.0]')
         g.xlabel('Profile size')
-        g.title("Setup: %s-neighboorhood%3d (threshold 20)" % (cfg.strategy,k))
-        g.plot(Gnuplot.Data(sorted([[i,sum(p_20_summary[k][i])/len(p_20_summary[k][i])]
-                                    for i in p_20_summary[k].keys()]),title="Precision"),
-               Gnuplot.Data(sorted([[i,coverage_20[k][i]]
-                                    for i in coverage_20[k].keys()]),title="Coverage"))
-        g.hardcopy(graph_20[k],terminal="png")
-        #commands.getoutput("convert -quality 100 %s %s" %
-        #                   (graph_20[k],graph_20_jpg[k]))
+        g.title("Setup: %s-neighborhood%3d (threshold 10)" % (cfg.strategy,k))
+        g.plot(Gnuplot.Data(sorted([[i,numpy.mean(p_10_summary[k][i]),numpy.std(p_10_summary[k][i])]
+                                    for i in p_10_summary[k].keys()]),title="Precision"),
+               Gnuplot.Data(sorted([[i,numpy.mean(p_10_summary[k][i]),numpy.std(p_10_summary[k][i])]
+                                    for i in p_10_summary[k].keys()]),title="Deviation",
+                                    with_="yerrorbar lt 2 pt 6"),
+               Gnuplot.Data(sorted([[i,coverage_10[k][i]]
+                                    for i in coverage_10[k].keys()]),title="Coverage"))
+        g.hardcopy(graph_10[k],terminal="png")
+
         g = Gnuplot.Gnuplot()
         g('set style data lines')
         g('set yrange [0:1.0]')
         g.xlabel('Profile size')
-        g.title("Setup: %s-neighboorhood%3d (threshold 100)" % (cfg.strategy,k))
-        g.plot(Gnuplot.Data(sorted([[i,sum(f05_100_summary[k][i])/len(f05_100_summary[k][i])]
+        g.title("Setup: %s-neighborhood%3d (threshold 100)" % (cfg.strategy,k))
+        g.plot(Gnuplot.Data(sorted([[i,numpy.mean(f05_100_summary[k][i]),numpy.std(f05_100_summary[k][i])]
                                     for i in f05_100_summary[k].keys()]),title="F05"),
+               Gnuplot.Data(sorted([[i,numpy.mean(f05_100_summary[k][i]),numpy.std(f05_100_summary[k][i])]
+                                    for i in f05_100_summary[k].keys()]),title="Deviation",
+                                    with_="yerrorbar lt 2 pt 6"),
                Gnuplot.Data(sorted([[i,coverage_100[k][i]]
                                     for i in coverage_100[k].keys()]),title="Coverage"))
         g.hardcopy(graph_100[k],terminal="png")
-        #commands.getoutput("convert -quality 100 %s %s" %
-        #                   (graph_100[k],graph_100_jpg[k]))
@@ -33,22 +33,21 @@ import numpy
  
 if __name__ == '__main__':
     if len(sys.argv)<2:
-        print "Usage: profile-suite strategy_category sample_file"
+        print "Usage: pure strategy_category sample_file"
         exit(1)
  
     iterations = 20
-    profile_size = [10,20,40,70,100,140,170,200,240]
-    neighbor_size = [3,5,10,50,100,150,200,300,400,500]
+    profile_size = [10,20,40,60,80,100,140,170,200,240]
+    neighbor_size = [3,5,10,20,30,50,70,100,150,200]
  
     content_strategies = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
-    collaborative_strategies = ['knn_eset']#,'knn_eset','knn_plus']
-    #collaborative_strategies = ['knn','knn_eset','knn_plus']
+    collaborative_strategies = ['knn_eset','knn','knn_plus']
  
     #iterations = 1
     #profile_size = [10,20,30]
-    #neighbor_size = [10,20,30]
+    #neighbor_size = [3,5,10,20,30,50]
     #content_strategies = ['cb']
-    #collaborative_strategies = ['knn_eset']
+    #collaborative_strategies = ['knn']
  
     strategy_category = sys.argv[1]
     if strategy_category == "content":
@@ -78,39 +77,39 @@ if __name__ == '__main__':
  
     for strategy in strategies:
         cfg.strategy = strategy
-        p_20_summary = {}
+        p_10_summary = {}
         f05_100_summary = {}
-        c_20 = {}
+        c_10 = {}
         c_100 = {}
  
         log_file = os.path.join(sample_dir,sample_str+"-"+cfg.strategy)
-        graph_20 = log_file+"-20.png"
+        graph_10 = log_file+"-10.png"
         graph_100 = log_file+"-100.png"
-        graph_20_jpg = graph_20.strip(".png")+".jpg"
+        graph_10_jpg = graph_10.strip(".png")+".jpg"
         graph_100_jpg = graph_100.strip(".png")+".jpg"
-        comment_20 = graph_20_jpg+".comment"
+        comment_10 = graph_10_jpg+".comment"
         comment_100 = graph_100_jpg+".comment"
  
-        with open(comment_20,'w') as f:
+        with open(comment_10,'w') as f:
             f.write("# sample %s\n" % sample_str)
-            f.write("# strategy %s\n# threshold 20\n# iterations %d\n\n" %
+            f.write("# strategy %s\n# threshold 10\n# iterations %d\n\n" %
                     (cfg.strategy,iterations))
-            f.write("# %s\tp_20\tc_20\n\n"%option_str)
+            f.write("# %s\tmean_p_10\tdev_p_10\tc_10\n\n"%option_str)
         with open(comment_100,'w') as f:
             f.write("# sample %s\n" % sample_str)
             f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
                     (cfg.strategy,iterations))
-            f.write("# %s\t\tf05_100\t\tc_100\n\n"%option_str)
+            f.write("# %s\t\tmean_f05_100\t\tdev_f05_100\t\tc_100\n\n"%option_str)
  
         for size in sizes:
-            c_20[size] = set()
+            c_10[size] = set()
             c_100[size] = set()
-            p_20_summary[size] = []
+            p_10_summary[size] = []
             f05_100_summary[size] = []
             with open(log_file+"-%s%.3d"%(option_str,size),'w') as f:
                 f.write("# sample %s\n" % sample_str)
                 f.write("# strategy %s-%s%.3d\n\n" % (cfg.strategy,option_str,size))
-                f.write("# p_20\tf05_100\n\n")
+                f.write("# p_10\tf05_100\n\n")
  
         # main loop per user
         for submission_file in population_sample:
@@ -122,7 +121,7 @@ if __name__ == '__main__':
                 cfg.k_neighbors = size
                 rec = Recommender(cfg)
                 repo_size = rec.items_repository.get_doccount()
-                p_20 = []
+                p_10 = []
                 f05_100 = []
                 for n in range(iterations):
                     # Fill sample profile
@@ -140,60 +139,61 @@ if __name__ == '__main__':
                     if hasattr(recommendation,"ranking"):
                         ranking = recommendation.ranking
                         real = RecommendationResult(sample)
-                        predicted_20 = RecommendationResult(dict.fromkeys(ranking[:20],1))
-                        evaluation = Evaluation(predicted_20,real,repo_size)
-                        p_20.append(evaluation.run(Precision()))
+                        predicted_10 = RecommendationResult(dict.fromkeys(ranking[:10],1))
+                        evaluation = Evaluation(predicted_10,real,repo_size)
+                        p_10.append(evaluation.run(Precision()))
                         predicted_100 = RecommendationResult(dict.fromkeys(ranking[:100],1))
                         evaluation = Evaluation(predicted_100,real,repo_size)
                         f05_100.append(evaluation.run(F_score(0.5)))
-                        c_20[size] = c_20[size].union(recommendation.ranking[:20])
+                        c_10[size] = c_10[size].union(recommendation.ranking[:10])
                         c_100[size] = c_100[size].union(recommendation.ranking[:100])
                 # save summary
-                if p_20:
-                    p_20_summary[size].append(sum(p_20)/len(p_20))
+                if p_10:
+                    p_10_summary[size].append(numpy.mean(p_10))
                 if f05_100:
-                    f05_100_summary[size].append(sum(f05_100)/len(f05_100))
+                    f05_100_summary[size].append(numpy.mean(f05_100))
  
                 with open(log_file+"-%s%.3d"%(option_str,size),'a') as f:
-                    f.write("%.4f \t%.4f\n" %
-                            ((sum(p_20)/len(p_20),sum(f05_100)/len(f05_100))))
+                    f.write("%.4f \t%.4f\n" % (numpy.mean(p_10),numpy.mean(f05_100)))
  
         # back to main flow
-        coverage_20 = {}
+        coverage_10 = {}
         coverage_100 = {}
-        with open(comment_20,'a') as f:
+        with open(comment_10,'a') as f:
             for size in sizes:
-                coverage_20[size] = len(c_20[size])/float(repo_size)
-                f.write("%3d\t\t%.4f\t\t%.4f\n" %
-                        (size,float(sum(p_20_summary[size]))/len(p_20_summary[size]),coverage_20[size]))
+                coverage_10[size] = len(c_10[size])/float(repo_size)
+                f.write("%3d\t\t%.4f\t\t%.4f\t\t%.4f\n" %
+                        (size,numpy.mean(p_10_summary[size]),numpy.std(p_10_summary[size]),coverage_10[size]))
         with open(comment_100,'a') as f:
             for size in sizes:
                 coverage_100[size] = len(c_100[size])/float(repo_size)
-                f.write("%3d\t\t%.4f\t\t%.4f\n" %
-                        (size,float(sum(f05_100_summary[size]))/len(f05_100_summary[size]),coverage_100[size]))
+                f.write("%3d\t\t%.4f\t\t%.4f\t\t%.4f\n" %
+                        (size,numpy.mean(f05_100_summary[size]),numpy.std(f05_100_summary[size]),coverage_100[size]))
  
         # plot results summary
         g = Gnuplot.Gnuplot()
         g('set style data lines')
         g('set yrange [0:1.0]')
         g.xlabel('%s size'%option_str.capitalize())
-        g.title("Setup: %s (threshold 20)" % cfg.strategy)
-        g.plot(Gnuplot.Data(sorted([[k,sum(p_20_summary[k])/len(p_20_summary[k])]
-                                    for k in p_20_summary.keys()]),title="Precision"),
-               Gnuplot.Data(sorted([[k,coverage_20[k]]
-                                    for k in coverage_20.keys()]),title="Coverage"))
-        g.hardcopy(graph_20,terminal="png")
-        commands.getoutput("convert -quality 20 %s %s" %
-                           (graph_100,graph_20_jpg))
+        g.title("Setup: %s (threshold 10)" % cfg.strategy)
+        g.plot(Gnuplot.Data(sorted([[k,numpy.mean(p_10_summary[k]),numpy.std(p_10_summary[k])]
+                                    for k in p_10_summary.keys()]),title="Precision"),
+               Gnuplot.Data(sorted([[k,numpy.mean(p_10_summary[k]),numpy.std(p_10_summary[k])]
+                                    for k in p_10_summary.keys()]),title="Deviation",
+                                    with_="yerrorbar lt 2 pt 6"),
+               Gnuplot.Data(sorted([[k,coverage_10[k]]
+                                    for k in coverage_10.keys()]),title="Coverage"))
+        g.hardcopy(graph_10,terminal="png")
         g = Gnuplot.Gnuplot()
         g('set style data lines')
         g('set yrange [0:1.0]')
         g.xlabel('%s size'%option_str.capitalize())
         g.title("Setup: %s (threshold 100)" % cfg.strategy)
-        g.plot(Gnuplot.Data(sorted([[k,sum(f05_100_summary[k])/len(f05_100_summary[k])]
+        g.plot(Gnuplot.Data(sorted([[k,numpy.mean(f05_100_summary[k]),numpy.std(f05_100_summary[k])]
                                     for k in f05_100_summary.keys()]),title="F05"),
+               Gnuplot.Data(sorted([[k,numpy.mean(f05_100_summary[k]),numpy.std(f05_100_summary[k])]
+                                    for k in f05_100_summary.keys()]),title="Deviation",
+                                    with_="yerrorbar lt 2 pt 6"),
                Gnuplot.Data(sorted([[k,coverage_100[k]]
                                     for k in coverage_100.keys()]),title="Coverage"))
         g.hardcopy(graph_100,terminal="png")
-        commands.getoutput("convert -quality 100 %s %s" %
-                           (graph_100,graph_100_jpg))
...	...	@@ -31,6 +31,8 @@ import random
31	31	import Gnuplot
32	32	import numpy
33	33
	34	+#hybrid_strategies = ['knnco','knnco_eset']
	35	+
34	36	if __name__ == '__main__':
35	37	if len(sys.argv)<2:
36	38	print "Usage: hybrid strategy sample_file"
...	...	@@ -38,9 +40,7 @@ if __name__ == '__main__':
38	40
39	41	iterations = 20
40	42	profile_size = [10,40,70,100,170,240]
41		- neighbor_size = [3,10,50,100,200,400]
42		-
43		- #hybrid_strategies = ['knnco','knnco_eset']
	43	+ neighbor_size = [3,10,50,70,100,150,200]
44	44
45	45	#iterations = 1
46	46	#profile_size = [10,20,30]
...	...	@@ -55,55 +55,55 @@ if __name__ == '__main__':
55	55	for line in f.readlines():
56	56	user_id = line.strip('\n')
57	57	population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
58		- sample_dir = ("results/hybrid/%s" % sample_str)
	58	+ sample_dir = ("results/hybrid/%s/%s" % (sample_str,strategy))
59	59	if not os.path.exists(sample_dir):
60	60	os.makedirs(sample_dir)
61	61
62	62	cfg.strategy = strategy
63		- p_20_summary = {}
	63	+ p_10_summary = {}
64	64	f05_100_summary = {}
65		- c_20 = {}
	65	+ c_10 = {}
66	66	c_100 = {}
67	67
68	68	log_file = os.path.join(sample_dir,sample_str+"-"+cfg.strategy)
69		- graph_20 = {}
	69	+ graph_10 = {}
70	70	graph_100 = {}
71		- graph_20_jpg = {}
	71	+ graph_10_jpg = {}
72	72	graph_100_jpg = {}
73		- comment_20 = {}
	73	+ comment_10 = {}
74	74	comment_100 = {}
75	75	for k in neighbor_size:
76		- graph_20[k] = log_file+("-neighboorhod%.3d-020.png"%k)
77		- graph_100[k] = log_file+("-neighboorhod%.3d-100.png"%k)
78		- graph_20_jpg[k] = graph_20[k].strip(".png")+".jpg"
	76	+ graph_10[k] = log_file+("-neighborhood%.3d-010.png"%k)
	77	+ graph_100[k] = log_file+("-neighborhood%.3d-100.png"%k)
	78	+ graph_10_jpg[k] = graph_10[k].strip(".png")+".jpg"
79	79	graph_100_jpg[k] = graph_100[k].strip(".png")+".jpg"
80		- comment_20[k] = graph_20_jpg[k]+".comment"
	80	+ comment_10[k] = graph_10_jpg[k]+".comment"
81	81	comment_100[k] = graph_100_jpg[k]+".comment"
82	82
83		- with open(comment_20[k],'w') as f:
	83	+ with open(comment_10[k],'w') as f:
84	84	f.write("# %s\n" % sample_str)
85		- f.write("# strategy %s\n# threshold 20\n# iterations %d\n\n" %
	85	+ f.write("# strategy %s\n# threshold 10\n# iterations %d\n\n" %
86	86	(cfg.strategy,iterations))
87		- f.write("# neighboorhood\tprofile\tp_20\tc_20\n\n")
	87	+ f.write("# neighborhood\tprofile\tmean_p_10\tdev_p_10\tc_10\n\n")
88	88	with open(comment_100[k],'w') as f:
89	89	f.write("# %s\n" % sample_str)
90	90	f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
91	91	(cfg.strategy,iterations))
92		- f.write("# neighboorhood\tprofile\tf05_100\tc_100\n\n")
	92	+ f.write("# neighborhood\tprofile\tmean_f05_100\tdev_f05_100\tc_100\n\n")
93	93
94		- c_20[k] = {}
	94	+ c_10[k] = {}
95	95	c_100[k] = {}
96		- p_20_summary[k] = {}
	96	+ p_10_summary[k] = {}
97	97	f05_100_summary[k] = {}
98	98	for size in profile_size:
99		- c_20[k][size] = set()
	99	+ c_10[k][size] = set()
100	100	c_100[k][size] = set()
101		- p_20_summary[k][size] = []
	101	+ p_10_summary[k][size] = []
102	102	f05_100_summary[k][size] = []
103		- with open(log_file+"-neighboorhood%.3d-profile%.3d"%(k,size),'w') as f:
	103	+ with open(log_file+"-neighborhood%.3d-profile%.3d"%(k,size),'w') as f:
104	104	f.write("# %s\n" % sample_str)
105		- f.write("# strategy %s-neighboorhood%.3d-profile%.3d\n\n" % (cfg.strategy,k,size))
106		- f.write("# p_20\t\tf05_100\n\n")
	105	+ f.write("# strategy %s-neighborhood%.3d-profile%.3d\n\n" % (cfg.strategy,k,size))
	106	+ f.write("# p_10\t\tf05_100\n\n")
107	107
108	108	# main loop per user
109	109	for submission_file in population_sample:
...	...	@@ -116,7 +116,7 @@ if __name__ == '__main__':
116	116	cfg.profile_size = size
117	117	rec = Recommender(cfg)
118	118	repo_size = rec.items_repository.get_doccount()
119		- p_20 = []
	119	+ p_10 = []
120	120	f05_100 = []
121	121	for n in range(iterations):
122	122	# Fill sample profile
...	...	@@ -134,40 +134,42 @@ if __name__ == '__main__':
134	134	if hasattr(recommendation,"ranking"):
135	135	ranking = recommendation.ranking
136	136	real = RecommendationResult(sample)
137		- predicted_20 = RecommendationResult(dict.fromkeys(ranking[:20],1))
138		- evaluation = Evaluation(predicted_20,real,repo_size)
139		- p_20.append(evaluation.run(Precision()))
	137	+ predicted_10 = RecommendationResult(dict.fromkeys(ranking[:10],1))
	138	+ evaluation = Evaluation(predicted_10,real,repo_size)
	139	+ p_10.append(evaluation.run(Precision()))
140	140	predicted_100 = RecommendationResult(dict.fromkeys(ranking[:100],1))
141	141	evaluation = Evaluation(predicted_100,real,repo_size)
142	142	f05_100.append(evaluation.run(F_score(0.5)))
143		- c_20[k][size] = c_20[k][size].union(recommendation.ranking[:20])
	143	+ c_10[k][size] = c_10[k][size].union(recommendation.ranking[:10])
144	144	c_100[k][size] = c_100[k][size].union(recommendation.ranking[:100])
145	145	# save summary
146		- if p_20:
147		- p_20_summary[k][size].append(sum(p_20)/len(p_20))
	146	+ if p_10:
	147	+ p_10_summary[k][size].append(numpy.mean(p_10))
148	148	if f05_100:
149		- f05_100_summary[k][size].append(sum(f05_100)/len(f05_100))
	149	+ f05_100_summary[k][size].append(numpy.mean(f05_100))
150	150
151		- with open(log_file+"-neighboorhood%.3d-profile%.3d"%(k,size),'a') as f:
	151	+ with open(log_file+"-neighborhood%.3d-profile%.3d"%(k,size),'a') as f:
152	152	f.write("%.4f\t\t%.4f\n" %
153		- ((sum(p_20)/len(p_20),sum(f05_100)/len(f05_100))))
	153	+ (numpy.mean(p_10),numpy.mean(f05_100)))
154	154
155	155	# back to main flow
156		- coverage_20 = {}
	156	+ coverage_10 = {}
157	157	coverage_100 = {}
158	158	for k in neighbor_size:
159		- coverage_20[k] = {}
	159	+ coverage_10[k] = {}
160	160	coverage_100[k] = {}
161		- with open(comment_20[k],'a') as f:
	161	+ with open(comment_10[k],'a') as f:
162	162	for size in profile_size:
163		- coverage_20[k][size] = len(c_20[k][size])/float(repo_size)
164		- f.write("%3d\t\t%3d\t\t%.4f\t%.4f\n" %
165		- (k,size,float(sum(p_20_summary[k][size]))/len(p_20_summary[k][size]),coverage_20[k][size]))
	163	+ coverage_10[k][size] = len(c_10[k][size])/float(repo_size)
	164	+ f.write("%3d\t\t%3d\t\t%.4f\t%.4f\t%.4f\n" %
	165	+ (k,size,numpy.mean(p_10_summary[k][size]),
	166	+ numpy.std(p_10_summary[k][size]),coverage_10[k][size]))
166	167	with open(comment_100[k],'a') as f:
167	168	for size in profile_size:
168	169	coverage_100[k][size] = len(c_100[k][size])/float(repo_size)
169		- f.write("%3d\t\t%3d\t\t%.4f\t%.4f\n" %
170		- (k,size,float(sum(f05_100_summary[k][size]))/len(f05_100_summary[k][size]),coverage_100[k][size]))
	170	+ f.write("%3d\t\t%3d\t\t%.4f\t%.4f\t%.4f\n" %
	171	+ (k,size,numpy.mean(f05_100_summary[k][size]),
	172	+ numpy.std(f05_100_summary[k][size]),coverage_100[k][size]))
171	173
172	174	for k in neighbor_size:
173	175	# plot results summary
...	...	@@ -175,23 +177,26 @@ if __name__ == '__main__':
175	177	g('set style data lines')
176	178	g('set yrange [0:1.0]')
177	179	g.xlabel('Profile size')
178		- g.title("Setup: %s-neighboorhood%3d (threshold 20)" % (cfg.strategy,k))
179		- g.plot(Gnuplot.Data(sorted([[i,sum(p_20_summary[k][i])/len(p_20_summary[k][i])]
180		- for i in p_20_summary[k].keys()]),title="Precision"),
181		- Gnuplot.Data(sorted([[i,coverage_20[k][i]]
182		- for i in coverage_20[k].keys()]),title="Coverage"))
183		- g.hardcopy(graph_20[k],terminal="png")
184		- #commands.getoutput("convert -quality 100 %s %s" %
185		- # (graph_20[k],graph_20_jpg[k]))
	180	+ g.title("Setup: %s-neighborhood%3d (threshold 10)" % (cfg.strategy,k))
	181	+ g.plot(Gnuplot.Data(sorted([[i,numpy.mean(p_10_summary[k][i]),numpy.std(p_10_summary[k][i])]
	182	+ for i in p_10_summary[k].keys()]),title="Precision"),
	183	+ Gnuplot.Data(sorted([[i,numpy.mean(p_10_summary[k][i]),numpy.std(p_10_summary[k][i])]
	184	+ for i in p_10_summary[k].keys()]),title="Deviation",
	185	+ with_="yerrorbar lt 2 pt 6"),
	186	+ Gnuplot.Data(sorted([[i,coverage_10[k][i]]
	187	+ for i in coverage_10[k].keys()]),title="Coverage"))
	188	+ g.hardcopy(graph_10[k],terminal="png")
	189	+
186	190	g = Gnuplot.Gnuplot()
187	191	g('set style data lines')
188	192	g('set yrange [0:1.0]')
189	193	g.xlabel('Profile size')
190		- g.title("Setup: %s-neighboorhood%3d (threshold 100)" % (cfg.strategy,k))
191		- g.plot(Gnuplot.Data(sorted([[i,sum(f05_100_summary[k][i])/len(f05_100_summary[k][i])]
	194	+ g.title("Setup: %s-neighborhood%3d (threshold 100)" % (cfg.strategy,k))
	195	+ g.plot(Gnuplot.Data(sorted([[i,numpy.mean(f05_100_summary[k][i]),numpy.std(f05_100_summary[k][i])]
192	196	for i in f05_100_summary[k].keys()]),title="F05"),
	197	+ Gnuplot.Data(sorted([[i,numpy.mean(f05_100_summary[k][i]),numpy.std(f05_100_summary[k][i])]
	198	+ for i in f05_100_summary[k].keys()]),title="Deviation",
	199	+ with_="yerrorbar lt 2 pt 6"),
193	200	Gnuplot.Data(sorted([[i,coverage_100[k][i]]
194	201	for i in coverage_100[k].keys()]),title="Coverage"))
195	202	g.hardcopy(graph_100[k],terminal="png")
196		- #commands.getoutput("convert -quality 100 %s %s" %
197		- # (graph_100[k],graph_100_jpg[k]))
...	...
...	...	@@ -33,22 +33,21 @@ import numpy
33	33
34	34	if __name__ == '__main__':
35	35	if len(sys.argv)<2:
36		- print "Usage: profile-suite strategy_category sample_file"
	36	+ print "Usage: pure strategy_category sample_file"
37	37	exit(1)
38	38
39	39	iterations = 20
40		- profile_size = [10,20,40,70,100,140,170,200,240]
41		- neighbor_size = [3,5,10,50,100,150,200,300,400,500]
	40	+ profile_size = [10,20,40,60,80,100,140,170,200,240]
	41	+ neighbor_size = [3,5,10,20,30,50,70,100,150,200]
42	42
43	43	content_strategies = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
44		- collaborative_strategies = ['knn_eset']#,'knn_eset','knn_plus']
45		- #collaborative_strategies = ['knn','knn_eset','knn_plus']
	44	+ collaborative_strategies = ['knn_eset','knn','knn_plus']
46	45
47	46	#iterations = 1
48	47	#profile_size = [10,20,30]
49		- #neighbor_size = [10,20,30]
	48	+ #neighbor_size = [3,5,10,20,30,50]
50	49	#content_strategies = ['cb']
51		- #collaborative_strategies = ['knn_eset']
	50	+ #collaborative_strategies = ['knn']
52	51
53	52	strategy_category = sys.argv[1]
54	53	if strategy_category == "content":
...	...	@@ -78,39 +77,39 @@ if __name__ == '__main__':
78	77
79	78	for strategy in strategies:
80	79	cfg.strategy = strategy
81		- p_20_summary = {}
	80	+ p_10_summary = {}
82	81	f05_100_summary = {}
83		- c_20 = {}
	82	+ c_10 = {}
84	83	c_100 = {}
85	84
86	85	log_file = os.path.join(sample_dir,sample_str+"-"+cfg.strategy)
87		- graph_20 = log_file+"-20.png"
	86	+ graph_10 = log_file+"-10.png"
88	87	graph_100 = log_file+"-100.png"
89		- graph_20_jpg = graph_20.strip(".png")+".jpg"
	88	+ graph_10_jpg = graph_10.strip(".png")+".jpg"
90	89	graph_100_jpg = graph_100.strip(".png")+".jpg"
91		- comment_20 = graph_20_jpg+".comment"
	90	+ comment_10 = graph_10_jpg+".comment"
92	91	comment_100 = graph_100_jpg+".comment"
93	92
94		- with open(comment_20,'w') as f:
	93	+ with open(comment_10,'w') as f:
95	94	f.write("# sample %s\n" % sample_str)
96		- f.write("# strategy %s\n# threshold 20\n# iterations %d\n\n" %
	95	+ f.write("# strategy %s\n# threshold 10\n# iterations %d\n\n" %
97	96	(cfg.strategy,iterations))
98		- f.write("# %s\tp_20\tc_20\n\n"%option_str)
	97	+ f.write("# %s\tmean_p_10\tdev_p_10\tc_10\n\n"%option_str)
99	98	with open(comment_100,'w') as f:
100	99	f.write("# sample %s\n" % sample_str)
101	100	f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
102	101	(cfg.strategy,iterations))
103		- f.write("# %s\t\tf05_100\t\tc_100\n\n"%option_str)
	102	+ f.write("# %s\t\tmean_f05_100\t\tdev_f05_100\t\tc_100\n\n"%option_str)
104	103
105	104	for size in sizes:
106		- c_20[size] = set()
	105	+ c_10[size] = set()
107	106	c_100[size] = set()
108		- p_20_summary[size] = []
	107	+ p_10_summary[size] = []
109	108	f05_100_summary[size] = []
110	109	with open(log_file+"-%s%.3d"%(option_str,size),'w') as f:
111	110	f.write("# sample %s\n" % sample_str)
112	111	f.write("# strategy %s-%s%.3d\n\n" % (cfg.strategy,option_str,size))
113		- f.write("# p_20\tf05_100\n\n")
	112	+ f.write("# p_10\tf05_100\n\n")
114	113
115	114	# main loop per user
116	115	for submission_file in population_sample:
...	...	@@ -122,7 +121,7 @@ if __name__ == '__main__':
122	121	cfg.k_neighbors = size
123	122	rec = Recommender(cfg)
124	123	repo_size = rec.items_repository.get_doccount()
125		- p_20 = []
	124	+ p_10 = []
126	125	f05_100 = []
127	126	for n in range(iterations):
128	127	# Fill sample profile
...	...	@@ -140,60 +139,61 @@ if __name__ == '__main__':
140	139	if hasattr(recommendation,"ranking"):
141	140	ranking = recommendation.ranking
142	141	real = RecommendationResult(sample)
143		- predicted_20 = RecommendationResult(dict.fromkeys(ranking[:20],1))
144		- evaluation = Evaluation(predicted_20,real,repo_size)
145		- p_20.append(evaluation.run(Precision()))
	142	+ predicted_10 = RecommendationResult(dict.fromkeys(ranking[:10],1))
	143	+ evaluation = Evaluation(predicted_10,real,repo_size)
	144	+ p_10.append(evaluation.run(Precision()))
146	145	predicted_100 = RecommendationResult(dict.fromkeys(ranking[:100],1))
147	146	evaluation = Evaluation(predicted_100,real,repo_size)
148	147	f05_100.append(evaluation.run(F_score(0.5)))
149		- c_20[size] = c_20[size].union(recommendation.ranking[:20])
	148	+ c_10[size] = c_10[size].union(recommendation.ranking[:10])
150	149	c_100[size] = c_100[size].union(recommendation.ranking[:100])
151	150	# save summary
152		- if p_20:
153		- p_20_summary[size].append(sum(p_20)/len(p_20))
	151	+ if p_10:
	152	+ p_10_summary[size].append(numpy.mean(p_10))
154	153	if f05_100:
155		- f05_100_summary[size].append(sum(f05_100)/len(f05_100))
	154	+ f05_100_summary[size].append(numpy.mean(f05_100))
156	155
157	156	with open(log_file+"-%s%.3d"%(option_str,size),'a') as f:
158		- f.write("%.4f \t%.4f\n" %
159		- ((sum(p_20)/len(p_20),sum(f05_100)/len(f05_100))))
	157	+ f.write("%.4f \t%.4f\n" % (numpy.mean(p_10),numpy.mean(f05_100)))
160	158
161	159	# back to main flow
162		- coverage_20 = {}
	160	+ coverage_10 = {}
163	161	coverage_100 = {}
164		- with open(comment_20,'a') as f:
	162	+ with open(comment_10,'a') as f:
165	163	for size in sizes:
166		- coverage_20[size] = len(c_20[size])/float(repo_size)
167		- f.write("%3d\t\t%.4f\t\t%.4f\n" %
168		- (size,float(sum(p_20_summary[size]))/len(p_20_summary[size]),coverage_20[size]))
	164	+ coverage_10[size] = len(c_10[size])/float(repo_size)
	165	+ f.write("%3d\t\t%.4f\t\t%.4f\t\t%.4f\n" %
	166	+ (size,numpy.mean(p_10_summary[size]),numpy.std(p_10_summary[size]),coverage_10[size]))
169	167	with open(comment_100,'a') as f:
170	168	for size in sizes:
171	169	coverage_100[size] = len(c_100[size])/float(repo_size)
172		- f.write("%3d\t\t%.4f\t\t%.4f\n" %
173		- (size,float(sum(f05_100_summary[size]))/len(f05_100_summary[size]),coverage_100[size]))
	170	+ f.write("%3d\t\t%.4f\t\t%.4f\t\t%.4f\n" %
	171	+ (size,numpy.mean(f05_100_summary[size]),numpy.std(f05_100_summary[size]),coverage_100[size]))
174	172
175	173	# plot results summary
176	174	g = Gnuplot.Gnuplot()
177	175	g('set style data lines')
178	176	g('set yrange [0:1.0]')
179	177	g.xlabel('%s size'%option_str.capitalize())
180		- g.title("Setup: %s (threshold 20)" % cfg.strategy)
181		- g.plot(Gnuplot.Data(sorted([[k,sum(p_20_summary[k])/len(p_20_summary[k])]
182		- for k in p_20_summary.keys()]),title="Precision"),
183		- Gnuplot.Data(sorted([[k,coverage_20[k]]
184		- for k in coverage_20.keys()]),title="Coverage"))
185		- g.hardcopy(graph_20,terminal="png")
186		- commands.getoutput("convert -quality 20 %s %s" %
187		- (graph_100,graph_20_jpg))
	178	+ g.title("Setup: %s (threshold 10)" % cfg.strategy)
	179	+ g.plot(Gnuplot.Data(sorted([[k,numpy.mean(p_10_summary[k]),numpy.std(p_10_summary[k])]
	180	+ for k in p_10_summary.keys()]),title="Precision"),
	181	+ Gnuplot.Data(sorted([[k,numpy.mean(p_10_summary[k]),numpy.std(p_10_summary[k])]
	182	+ for k in p_10_summary.keys()]),title="Deviation",
	183	+ with_="yerrorbar lt 2 pt 6"),
	184	+ Gnuplot.Data(sorted([[k,coverage_10[k]]
	185	+ for k in coverage_10.keys()]),title="Coverage"))
	186	+ g.hardcopy(graph_10,terminal="png")
188	187	g = Gnuplot.Gnuplot()
189	188	g('set style data lines')
190	189	g('set yrange [0:1.0]')
191	190	g.xlabel('%s size'%option_str.capitalize())
192	191	g.title("Setup: %s (threshold 100)" % cfg.strategy)
193		- g.plot(Gnuplot.Data(sorted([[k,sum(f05_100_summary[k])/len(f05_100_summary[k])]
	192	+ g.plot(Gnuplot.Data(sorted([[k,numpy.mean(f05_100_summary[k]),numpy.std(f05_100_summary[k])]
194	193	for k in f05_100_summary.keys()]),title="F05"),
	194	+ Gnuplot.Data(sorted([[k,numpy.mean(f05_100_summary[k]),numpy.std(f05_100_summary[k])]
	195	+ for k in f05_100_summary.keys()]),title="Deviation",
	196	+ with_="yerrorbar lt 2 pt 6"),
195	197	Gnuplot.Data(sorted([[k,coverage_100[k]]
196	198	for k in coverage_100.keys()]),title="Coverage"))
197	199	g.hardcopy(graph_100,terminal="png")
198		- commands.getoutput("convert -quality 100 %s %s" %
199		- (graph_100,graph_100_jpg))
...	...