Experiments refactoring.

Tássia Camões Araújo
1 parent 1aed15a5
Showing 2 changed files with 109 additions and 109 deletions Show diff stats
src/evaluation.py
src/experiments/strategies-suite.py
@@ -294,6 +294,10 @@ class CrossValidation:
             round_user = User(cross_item_score)
             result_size = int(self.recommender.items_repository.get_doccount()*
                               self.result_proportion)
+            logging.debug("size %d" % result_size)
+            if not result_size:
+                logging.critical("Recommendation size is zero.")
+                raise Error
             predicted_result = self.recommender.get_recommendation(round_user,result_size)
             if not predicted_result.size:
                 logging.critical("No recommendation produced. Abort cross-validation.")
@@ -30,121 +30,117 @@ import logging
 import random
 import Gnuplot
  
-def run_iteration(label,cfg,sample_proportion,n):
+def write_recall_log(label,sample,recommendation,log_file):
+    """Write the recall log of one recommendation to log_file.
+
+    The file starts with two "# " header lines (label["description"] and
+    label["values"]), followed by the 0-based rank, in ascending order, of
+    every sample package found in recommendation.ranking.  Packages that
+    are missing from the ranking are listed after an
+    "Out of recommendation:" line.
+
+    label          -- dict with "description" and "values" entries
+    sample         -- dict whose keys are the packages to look up
+    recommendation -- object exposing a `ranking` sequence
+    log_file       -- path of the file to (over)write
+    """
+    # Map each package to its first position once, instead of scanning the
+    # whole ranking twice (`in` + `index`) for every sample package.
+    position = {}
+    for rank,pkg in enumerate(recommendation.ranking):
+        position.setdefault(pkg,rank)
+    ranks = []
+    notfound = []
+    for pkg in sample.keys():
+        if pkg in position:
+            ranks.append(position[pkg])
+        else:
+            notfound.append(pkg)
+    # `with` closes the file even if one of the writes raises.
+    with open(log_file,'w') as output:
+        output.write("# %s\n" % label["description"])
+        output.write("# %s\n" % label["values"])
+        for r in sorted(ranks):
+            output.write(str(r)+"\n")
+        if notfound:
+            output.write("Out of recommendation:\n")
+            for pkg in notfound:
+                output.write(pkg+"\n")
+
+def plot_summary(sample,recommendation,repo_size,log_file):
+    """Plot accuracy, precision, recall and F1 against recommendation size.
+
+    The metrics are evaluated on growing prefixes of
+    recommendation.ranking (sizes 1, 101, 201, ...), each compared with
+    sample, and the four curves are saved as PostScript to
+    log_file + "-plot.ps".
+
+    sample         -- passed to RecommendationResult as the real result
+    recommendation -- object exposing a `ranking` sequence
+    repo_size      -- passed to Evaluation as its third argument
+    log_file       -- prefix of the output file name
+    """
+    accuracy = []
+    precision = []
+    recall = []
+    f1 = []
+    g = Gnuplot.Gnuplot()
+    g('set style data lines')
+    g.xlabel('Recommendation size')
+    for size in range(1,len(recommendation.ranking)+1,100):
+        # Treat the first `size` ranked items as the predicted set.
+        predicted = RecommendationResult(dict.fromkeys(recommendation.ranking[:size],1))
+        real = RecommendationResult(sample)
+        evaluation = Evaluation(predicted,real,repo_size)
+        accuracy.append([size,evaluation.run(Accuracy())])
+        precision.append([size,evaluation.run(Precision())])
+        recall.append([size,evaluation.run(Recall())])
+        f1.append([size,evaluation.run(F1())])
+
+    g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
+           Gnuplot.Data(precision,title="Precision"),
+           Gnuplot.Data(recall,title="Recall"),
+           Gnuplot.Data(f1,title="F1"))
+    # Write the plot once; the call used to be issued twice with identical
+    # arguments, which only rewrote the same file.
+    g.hardcopy(log_file+"-plot.ps", terminal="postscript")
+
+def run_iteration(user,cfg,label,sample):
+    """Run one recommender configuration and record its results.
+
+    Builds a Recommender from cfg, asks it for a recommendation for user
+    sized to the items repository document count, then forwards sample
+    and the recommendation to write_recall_log() and plot_summary().
+    Output goes to "results/strategies/" + label["values"].
+    """
     rec = Recommender(cfg)
     repo_size = rec.items_repository.get_doccount()
-    user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters,"desktop"))
-    print "profile",user.pkg_profile
-    user.maximal_pkg_profile()
-    sample_size = int(len(user.pkg_profile)*sample_proportion)
-    for n in range(iteration):
-        item_score = dict.fromkeys(user.pkg_profile,1)
-        # Prepare partition
-        sample = {}
-        for i in range(sample_size):
-             key = random.choice(item_score.keys())
-             sample[key] = item_score.pop(key)
-        # Get full recommendation
-        user = User(item_score)
-        recommendation = rec.get_recommendation(user,repo_size)
-        # Write recall log
-        log_file = "results/strategies/"+label["values"]
-        output = open(log_file,'w')
-        output.write("# %s\n" % label["description"])
-        output.write("# %s\n" % label["values"])
-        notfound = []
-        ranks = []
-        for pkg in sample.keys():
-            if pkg in recommendation.ranking:
-                ranks.append(recommendation.ranking.index(pkg))
-            else:
-                notfound.append(pkg)
-        for r in sorted(ranks):
-            output.write(str(r)+"\n")
-        if notfound:
-            output.write("Out of recommendation:\n")
-            for pkg in notfound:
-                output.write(pkg+"\n")
-        output.close()
-        # Plot metrics summary
-        accuracy = []
-        precision = []
-        recall = []
-        f1 = []
-        g = Gnuplot.Gnuplot()
-        g('set style data lines')
-        g.xlabel('Recommendation size')
-        for size in range(1,len(recommendation.ranking)+1,100):
-            predicted = RecommendationResult(dict.fromkeys(recommendation.ranking[:size],1))
-            real = RecommendationResult(sample)
-            evaluation = Evaluation(predicted,real,repo_size)
-            accuracy.append([size,evaluation.run(Accuracy())])
-            precision.append([size,evaluation.run(Precision())])
-            recall.append([size,evaluation.run(Recall())])
-            f1.append([size,evaluation.run(F1())])
-
-        g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
-               Gnuplot.Data(precision,title="Precision"),
-               Gnuplot.Data(recall,title="Recall"),
-               Gnuplot.Data(f1,title="F1"))
-        g.hardcopy(log_file+"-plot.ps", enhanced=1, color=1)
+    # Request as many items as the repository holds (repo_size).
+    recommendation = rec.get_recommendation(user,repo_size)
+    log_file = "results/strategies/"+label["values"]
+    write_recall_log(label,sample,recommendation,log_file)
+    plot_summary(sample,recommendation,repo_size,log_file)
  
+def run_strategies(user,sample,n):
+    """Run every configured strategy for one user/sample partition.
+
+    Iterates over the module-level grids bm25_k1, profile_size,
+    content_based, collaborative, popcon_size and neighbors (assigned in
+    the __main__ section) and calls run_iteration() once per combination.
+    Content-based runs happen when "content" is on the command line;
+    collaborative runs when "collaborative" (or the older spelling
+    "colaborative") is.  With no arguments both families run.
+
+    user   -- profile the recommendations are computed for; must expose
+              pkg_profile
+    sample -- dict of items forwarded to run_iteration() for evaluation
+    n      -- iteration number, used only in the result label
+    """
+    cfg = Config()
+    label = {}
+    # Share of (profile + sample) that ended up in the sample.  float()
+    # avoids Python 2 integer division and the explicit total keeps the
+    # sum in the denominator, where the old expression added len(sample)
+    # to the quotient instead.
+    total = len(user.pkg_profile)+len(sample)
+    if total:
+        sample_proportion = float(len(sample))/total
+    else:
+        sample_proportion = 0.0
+    run_content = "content" in sys.argv or len(sys.argv)<2
+    run_collaborative = ("collaborative" in sys.argv or
+                         "colaborative" in sys.argv or len(sys.argv)<2)
+    if run_collaborative:
+        # Keep the configured values so every size is appended to the
+        # same base instead of piling up across loop iterations.
+        # NOTE(review): assumes both settings are strings -- confirm.
+        base_desktopapps = cfg.popcon_desktopapps
+        base_programs = cfg.popcon_programs
+    for k1 in bm25_k1:
+        cfg.bm25_k1 = k1
+        if run_content:
+            for size in profile_size:
+                cfg.profile_size = size
+                for strategy in content_based:
+                    cfg.strategy = strategy
+                    label["description"] = "k1_bm25-profile-strategy-sample-n"
+                    label["values"] = ("%.2f-%d-%s-%.2f-%d" %
+                                       (cfg.bm25_k1,cfg.profile_size,
+                                        cfg.strategy,sample_proportion,n))
+                    run_iteration(user,cfg,label,sample)
+        if run_collaborative:
+            for strategy in collaborative:
+                cfg.strategy = strategy
+                for size in popcon_size:
+                    # popcon_size mixes ints and 'full', hence str().
+                    cfg.popcon_desktopapps = base_desktopapps+str(size)
+                    cfg.popcon_programs = base_programs+str(size)
+                    for neighbors_count in neighbors:
+                        cfg.k_neighbors = neighbors_count
+                        k_str = "k"+str(cfg.k_neighbors)
+                        label["description"] = "k1_bm25-popcon-strategy-k-sample-n"
+                        # Label with the current size, not the whole list,
+                        # so each size gets its own result file.
+                        label["values"] = ("%.2f-%s-%s-%s-%.2f-%d" %
+                                           (cfg.bm25_k1,str(size),cfg.strategy,
+                                            k_str,sample_proportion,n))
+                        run_iteration(user,cfg,label,sample)
  
 if __name__ == '__main__':
-    iteration = 10
+    # Number of random partitions drawn for each sample proportion.
+    iterations = 10
     samples_proportion = [0.5, 0.6, 0.7, 0.8, 0.9]
     weights = ['bm25', 'trad']
-    cb_strategies = ['cb','cbt','cbd']
-    #cb_strategies = []
-    profile_size = range(10,100,10)
-    items_repository = ["data/AppAxi","/var/lib/apt-xapian-index/index"]
-    users_repository = ["data/popcon_index_full","data/popcon_index-50000",
-                        "data/popcon_index_10000","data/popcon_index_1000"]
-    users_repository = []
-    neighbors = range(10,1010,100)
+    # Parameter grids read as globals by run_strategies().
+    bm25_k1 = [1.0, 1.2, 1.4, 1.6, 1.8, 2.0]
+    content_based = ['cb','cbt','cbd','cbh',
+                     'cb_eset','cbt_eset','cbd_eset','cbh_eset']
+    collaborative = ['knn','knn_plus','knn_eset']
+    # NOTE(review): 'hybrid' is not referenced by any code visible here.
+    hybrid = ['knnco','knnco_eset']
  
-    cfg = Config()
-    cfg.index_mode = "old"
-    label = {}
+    profile_size = range(10,100,10)
+    popcon_size = [1000,10000,50000,'full']
+    neighbors = range(10,510,100)
  
-    for w in weights:
-        cfg.weight = w
-        for items_repo in items_repository:
-            cfg.axi = items_repo
-            if "App" in cfg.axi:
-                axi_str = "axiapp"
-            else:
-                axi_str = "axifull"
-            for sample_proportion in samples_proportion:
-                if "content" in sys.argv or len(sys.argv)<2:
-                    for size in profile_size:
-                        cfg.profile_size = size
-                        for strategy in cb_strategies:
-                            cfg.strategy = strategy
-                            for n in range(iteration):
-                                label["description"] = "weight-axi-profile-strategy-sample-n"
-                                label["values"] = ("%s-%s-%d-%s-%.2f-%d" %
-                                                   (cfg.weight,axi_str,cfg.profile_size,
-                                                    cfg.strategy,sample_proportion,n))
-                                run_iteration(label,cfg,sample_proportion,n)
-                if "colaborative" in sys.argv or len(sys.argv)<2:
-                    cfg.strategy = "col"
-                    for users_repo in users_repository:
-                        cfg.popcon_index = users_repo
-                        for k in neighbors:
-                            cfg.k_neighbors = k
-                            for n in range(iteration):
-                                k_str = "k"+str(cfg.k_neighbors)
-                                if "full" in cfg.popcon_index:
-                                    popcon_str = "popfull"
-                                if "50000" in cfg.popcon_index:
-                                    popcon_str = "pop50000"
-                                if "10000" in cfg.popcon_index:
-                                    popcon_str = "pop10000"
-                                if "1000" in cfg.popcon_index:
-                                    popcon_str = "pop1000"
-                                label["description"] = "weight-axi-popcon-profile-strategy-k-sample-n"
-                                label["values"] = ("%s-%s-%s-%d-%s-%s-%.2f-%d" %
-                                                   (cfg.weight,axi_str,popcon_str,cfg.profile_size,
-                                                    cfg.strategy,k_str,sample_proportion,n))
-                                run_iteration(label,cfg,sample_proportion,n)
+    # The same user object is reused for every proportion and iteration.
+    user = LocalSystem()
+    #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
+    user.maximal_pkg_profile()
+    for sample_proportion in samples_proportion:
+        for n in range(iterations):
+            # Copy the scores of the profile packages into a fresh dict so
+            # the partition below does not modify user.item_score.
+            item_score = {}
+            for pkg in user.pkg_profile:
+                item_score[pkg] = user.item_score[pkg]
+            # Move sample_size randomly chosen packages out of item_score
+            # into sample; what is left is the profile given to User().
+            sample = {}
+            sample_size = int(len(user.pkg_profile)*sample_proportion)
+            for i in range(sample_size):
+                 key = random.choice(item_score.keys())
+                 sample[key] = item_score.pop(key)
+            run_strategies(User(item_score),sample,n)
...	...	@@ -294,6 +294,10 @@ class CrossValidation:
294	294	round_user = User(cross_item_score)
295	295	result_size = int(self.recommender.items_repository.get_doccount()*
296	296	self.result_proportion)
	297	+ logging.debug("size %d" % result_size)
	298	+ if not result_size:
	299	+ logging.critical("Recommendation size is zero.")
	300	+ raise Error
297	301	predicted_result = self.recommender.get_recommendation(round_user,result_size)
298	302	if not predicted_result.size:
299	303	logging.critical("No recommendation produced. Abort cross-validation.")
...	...
...	...	@@ -30,121 +30,117 @@ import logging
30	30	import random
31	31	import Gnuplot
32	32
33		-def run_iteration(label,cfg,sample_proportion,n):
	33	+def write_recall_log(label,sample,recommendation,log_file):
	34	+ # Write recall log
	35	+ output = open(log_file,'w')
	36	+ output.write("# %s\n" % label["description"])
	37	+ output.write("# %s\n" % label["values"])
	38	+ notfound = []
	39	+ ranks = []
	40	+ for pkg in sample.keys():
	41	+ if pkg in recommendation.ranking:
	42	+ ranks.append(recommendation.ranking.index(pkg))
	43	+ else:
	44	+ notfound.append(pkg)
	45	+ for r in sorted(ranks):
	46	+ output.write(str(r)+"\n")
	47	+ if notfound:
	48	+ output.write("Out of recommendation:\n")
	49	+ for pkg in notfound:
	50	+ output.write(pkg+"\n")
	51	+ output.close()
	52	+
	53	+def plot_summary(sample,recommendation,repo_size,log_file):
	54	+ # Plot metrics summary
	55	+ accuracy = []
	56	+ precision = []
	57	+ recall = []
	58	+ f1 = []
	59	+ g = Gnuplot.Gnuplot()
	60	+ g('set style data lines')
	61	+ g.xlabel('Recommendation size')
	62	+ for size in range(1,len(recommendation.ranking)+1,100):
	63	+ predicted = RecommendationResult(dict.fromkeys(recommendation.ranking[:size],1))
	64	+ real = RecommendationResult(sample)
	65	+ evaluation = Evaluation(predicted,real,repo_size)
	66	+ accuracy.append([size,evaluation.run(Accuracy())])
	67	+ precision.append([size,evaluation.run(Precision())])
	68	+ recall.append([size,evaluation.run(Recall())])
	69	+ f1.append([size,evaluation.run(F1())])
	70	+
	71	+ g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
	72	+ Gnuplot.Data(precision,title="Precision"),
	73	+ Gnuplot.Data(recall,title="Recall"),
	74	+ Gnuplot.Data(f1,title="F1"))
	75	+ g.hardcopy(log_file+"-plot.ps", terminal="postscript")
	76	+ g.hardcopy(log_file+"-plot.ps", terminal="postscript")
	77	+
	78	+def run_iteration(user,cfg,label,sample):
34	79	rec = Recommender(cfg)
35	80	repo_size = rec.items_repository.get_doccount()
36		- user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters,"desktop"))
37		- print "profile",user.pkg_profile
38		- user.maximal_pkg_profile()
39		- sample_size = int(len(user.pkg_profile)*sample_proportion)
40		- for n in range(iteration):
41		- item_score = dict.fromkeys(user.pkg_profile,1)
42		- # Prepare partition
43		- sample = {}
44		- for i in range(sample_size):
45		- key = random.choice(item_score.keys())
46		- sample[key] = item_score.pop(key)
47		- # Get full recommendation
48		- user = User(item_score)
49		- recommendation = rec.get_recommendation(user,repo_size)
50		- # Write recall log
51		- log_file = "results/strategies/"+label["values"]
52		- output = open(log_file,'w')
53		- output.write("# %s\n" % label["description"])
54		- output.write("# %s\n" % label["values"])
55		- notfound = []
56		- ranks = []
57		- for pkg in sample.keys():
58		- if pkg in recommendation.ranking:
59		- ranks.append(recommendation.ranking.index(pkg))
60		- else:
61		- notfound.append(pkg)
62		- for r in sorted(ranks):
63		- output.write(str(r)+"\n")
64		- if notfound:
65		- output.write("Out of recommendation:\n")
66		- for pkg in notfound:
67		- output.write(pkg+"\n")
68		- output.close()
69		- # Plot metrics summary
70		- accuracy = []
71		- precision = []
72		- recall = []
73		- f1 = []
74		- g = Gnuplot.Gnuplot()
75		- g('set style data lines')
76		- g.xlabel('Recommendation size')
77		- for size in range(1,len(recommendation.ranking)+1,100):
78		- predicted = RecommendationResult(dict.fromkeys(recommendation.ranking[:size],1))
79		- real = RecommendationResult(sample)
80		- evaluation = Evaluation(predicted,real,repo_size)
81		- accuracy.append([size,evaluation.run(Accuracy())])
82		- precision.append([size,evaluation.run(Precision())])
83		- recall.append([size,evaluation.run(Recall())])
84		- f1.append([size,evaluation.run(F1())])
85		-
86		- g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
87		- Gnuplot.Data(precision,title="Precision"),
88		- Gnuplot.Data(recall,title="Recall"),
89		- Gnuplot.Data(f1,title="F1"))
90		- g.hardcopy(log_file+"-plot.ps", enhanced=1, color=1)
	81	+ recommendation = rec.get_recommendation(user,repo_size)
	82	+ log_file = "results/strategies/"+label["values"]
	83	+ write_recall_log(label,sample,recommendation,log_file)
	84	+ plot_summary(sample,recommendation,repo_size,log_file)
91	85
	86	+def run_strategies(user,sample,n):
	87	+ cfg = Config()
	88	+ label = {}
	89	+ sample_proportion = (len(sample)/len(user.pkg_profile)+len(sample))
	90	+ for k in bm25_k1:
	91	+ cfg.bm25_k1 = k
	92	+ if "content" in sys.argv or len(sys.argv)<2:
	93	+ for size in profile_size:
	94	+ cfg.profile_size = size
	95	+ for strategy in content_based:
	96	+ cfg.strategy = strategy
	97	+ label["description"] = "k1_bm25-profile-strategy-sample-n"
	98	+ label["values"] = ("%.2f-%d-%s-%.2f-%d" %
	99	+ (cfg.bm25_k1,cfg.profile_size,
	100	+ cfg.strategy,sample_proportion,n))
	101	+ run_iteration(user,cfg,label,sample)
	102	+ if "colaborative" in sys.argv or len(sys.argv)<2:
	103	+ for strategy in collaborative:
	104	+ cfg.strategy = strategy
	105	+ for size in popcon_size:
	106	+ cfg.popcon_desktopapps = cfg.popcon_desktopapps+size
	107	+ cfg.popcon_programs = cfg.popcon_programs+size
	108	+ for k in neighbors:
	109	+ cfg.k_neighbors = k
	110	+ k_str = "k"+str(cfg.k_neighbors)
	111	+ label["description"] = "k1_bm25-popcon-strategy-k-sample-n"
	112	+ label["values"] = ("%.2f-%s-%s-%s-%.2f-%d" %
	113	+ (cfg.bm25_k1,str(popcon_size),cfg.strategy,
	114	+ k_str,sample_proportion,n))
	115	+ run_iteration(user,cfg,label,sample)
92	116
93	117	if __name__ == '__main__':
94		- iteration = 10
	118	+ iterations = 10
95	119	samples_proportion = [0.5, 0.6, 0.7, 0.8, 0.9]
96	120	weights = ['bm25', 'trad']
97		- cb_strategies = ['cb','cbt','cbd']
98		- #cb_strategies = []
99		- profile_size = range(10,100,10)
100		- items_repository = ["data/AppAxi","/var/lib/apt-xapian-index/index"]
101		- users_repository = ["data/popcon_index_full","data/popcon_index-50000",
102		- "data/popcon_index_10000","data/popcon_index_1000"]
103		- users_repository = []
104		- neighbors = range(10,1010,100)
	121	+ bm25_k1 = [1.0, 1.2, 1.4, 1.6, 1.8, 2.0]
	122	+ content_based = ['cb','cbt','cbd','cbh',
	123	+ 'cb_eset','cbt_eset','cbd_eset','cbh_eset']
	124	+ collaborative = ['knn','knn_plus','knn_eset']
	125	+ hybrid = ['knnco','knnco_eset']
105	126
106		- cfg = Config()
107		- cfg.index_mode = "old"
108		- label = {}
	127	+ profile_size = range(10,100,10)
	128	+ popcon_size = [1000,10000,50000,'full']
	129	+ neighbors = range(10,510,100)
109	130
110		- for w in weights:
111		- cfg.weight = w
112		- for items_repo in items_repository:
113		- cfg.axi = items_repo
114		- if "App" in cfg.axi:
115		- axi_str = "axiapp"
116		- else:
117		- axi_str = "axifull"
118		- for sample_proportion in samples_proportion:
119		- if "content" in sys.argv or len(sys.argv)<2:
120		- for size in profile_size:
121		- cfg.profile_size = size
122		- for strategy in cb_strategies:
123		- cfg.strategy = strategy
124		- for n in range(iteration):
125		- label["description"] = "weight-axi-profile-strategy-sample-n"
126		- label["values"] = ("%s-%s-%d-%s-%.2f-%d" %
127		- (cfg.weight,axi_str,cfg.profile_size,
128		- cfg.strategy,sample_proportion,n))
129		- run_iteration(label,cfg,sample_proportion,n)
130		- if "colaborative" in sys.argv or len(sys.argv)<2:
131		- cfg.strategy = "col"
132		- for users_repo in users_repository:
133		- cfg.popcon_index = users_repo
134		- for k in neighbors:
135		- cfg.k_neighbors = k
136		- for n in range(iteration):
137		- k_str = "k"+str(cfg.k_neighbors)
138		- if "full" in cfg.popcon_index:
139		- popcon_str = "popfull"
140		- if "50000" in cfg.popcon_index:
141		- popcon_str = "pop50000"
142		- if "10000" in cfg.popcon_index:
143		- popcon_str = "pop10000"
144		- if "1000" in cfg.popcon_index:
145		- popcon_str = "pop1000"
146		- label["description"] = "weight-axi-popcon-profile-strategy-k-sample-n"
147		- label["values"] = ("%s-%s-%s-%d-%s-%s-%.2f-%d" %
148		- (cfg.weight,axi_str,popcon_str,cfg.profile_size,
149		- cfg.strategy,k_str,sample_proportion,n))
150		- run_iteration(label,cfg,sample_proportion,n)
	131	+ user = LocalSystem()
	132	+ #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
	133	+ user.maximal_pkg_profile()
	134	+ for sample_proportion in samples_proportion:
	135	+ for n in range(iterations):
	136	+ # Fill user profile
	137	+ item_score = {}
	138	+ for pkg in user.pkg_profile:
	139	+ item_score[pkg] = user.item_score[pkg]
	140	+ # Prepare partition sample
	141	+ sample = {}
	142	+ sample_size = int(len(user.pkg_profile)*sample_proportion)
	143	+ for i in range(sample_size):
	144	+ key = random.choice(item_score.keys())
	145	+ sample[key] = item_score.pop(key)
	146	+ run_strategies(User(item_score),sample,n)
...	...