Commit 79e91d8c5e76016236e6af59f00a1fbbb813b390

Authored by Tássia Camões Araújo
1 parent 1aed15a5
Exists in master and in 1 other branch add_vagrant

Experiments refactoring.

src/evaluation.py
@@ -294,6 +294,10 @@ class CrossValidation: @@ -294,6 +294,10 @@ class CrossValidation:
294 round_user = User(cross_item_score) 294 round_user = User(cross_item_score)
295 result_size = int(self.recommender.items_repository.get_doccount()* 295 result_size = int(self.recommender.items_repository.get_doccount()*
296 self.result_proportion) 296 self.result_proportion)
  297 + logging.debug("size %d" % result_size)
  298 + if not result_size:
  299 + logging.critical("Recommendation size is zero.")
  300 + raise Error
297 predicted_result = self.recommender.get_recommendation(round_user,result_size) 301 predicted_result = self.recommender.get_recommendation(round_user,result_size)
298 if not predicted_result.size: 302 if not predicted_result.size:
299 logging.critical("No recommendation produced. Abort cross-validation.") 303 logging.critical("No recommendation produced. Abort cross-validation.")
src/experiments/strategies-suite.py
@@ -30,121 +30,117 @@ import logging @@ -30,121 +30,117 @@ import logging
30 import random 30 import random
31 import Gnuplot 31 import Gnuplot
32 32
def write_recall_log(label,sample,recommendation,log_file):
    """Write the ranks of the sampled packages within a recommendation.

    The log starts with two '#' header lines (label["description"] and
    label["values"]), followed by the zero-based rank of every sampled
    package found in recommendation.ranking, one per line in ascending
    order. Sampled packages missing from the ranking are listed after an
    "Out of recommendation:" line.

    label          -- dict with the keys "description" and "values"
    sample         -- dict whose keys are the held-out package names
    recommendation -- object exposing a 'ranking' sequence of package names
    log_file       -- path of the file to (over)write
    """
    # Map each package to its first position in the ranking. This matches
    # list.index() semantics without rescanning the ranking per package.
    position = {}
    for rank, pkg in enumerate(recommendation.ranking):
        position.setdefault(pkg, rank)

    ranks = []
    notfound = []
    for pkg in sample.keys():
        if pkg in position:
            ranks.append(position[pkg])
        else:
            notfound.append(pkg)

    # The context manager closes the file even if a write fails.
    with open(log_file,'w') as output:
        output.write("# %s\n" % label["description"])
        output.write("# %s\n" % label["values"])
        for r in sorted(ranks):
            output.write(str(r)+"\n")
        if notfound:
            output.write("Out of recommendation:\n")
            for pkg in notfound:
                output.write(pkg+"\n")
  52 +
def plot_summary(sample,recommendation,repo_size,log_file):
    """Plot accuracy, precision, recall and F1 against recommendation size.

    Prefixes of recommendation.ranking of size 1, 101, 201, ... are each
    evaluated against 'sample', and the four resulting curves are written
    to "<log_file>-plot.ps" through Gnuplot.

    sample         -- dict of held-out packages, used as the real result
    recommendation -- object exposing a 'ranking' sequence
    repo_size      -- passed through to Evaluation unchanged
    log_file       -- path prefix for the generated postscript file
    """
    accuracy, precision, recall, f1 = [], [], [], []
    plot = Gnuplot.Gnuplot()
    plot('set style data lines')
    plot.xlabel('Recommendation size')

    # Pair every curve with the metric class that fills it, in the order
    # the metrics are evaluated for each cutoff.
    curves = [(accuracy, Accuracy), (precision, Precision),
              (recall, Recall), (f1, F1)]
    for cutoff in range(1,len(recommendation.ranking)+1,100):
        top_items = dict.fromkeys(recommendation.ranking[:cutoff],1)
        predicted = RecommendationResult(top_items)
        real = RecommendationResult(sample)
        evaluation = Evaluation(predicted,real,repo_size)
        for points, metric in curves:
            points.append([cutoff,evaluation.run(metric())])

    plot.plot(Gnuplot.Data(accuracy,title="Accuracy"),
              Gnuplot.Data(precision,title="Precision"),
              Gnuplot.Data(recall,title="Recall"),
              Gnuplot.Data(f1,title="F1"))
    # NOTE(review): the hardcopy call is issued twice with identical
    # arguments, as in the original; confirm whether this is a deliberate
    # workaround or an accidental duplicate before removing one.
    output_path = log_file+"-plot.ps"
    plot.hardcopy(output_path, terminal="postscript")
    plot.hardcopy(output_path, terminal="postscript")
  77 +
def run_iteration(user,cfg,label,sample):
    """Run one experiment round and record its results.

    Builds a Recommender from 'cfg', requests a recommendation for 'user'
    sized to the whole items repository, then writes the recall log and
    the metrics plot under "results/strategies/", using label["values"]
    as the file name.

    user   -- user profile handed to Recommender.get_recommendation
    cfg    -- configuration used to build the Recommender
    label  -- dict with the keys "description" and "values"
    sample -- dict of held-out packages the ranking is evaluated against
    """
    # Local import: only part of the file's import block is visible here.
    import os

    log_file = "results/strategies/"+label["values"]
    # Create the output directory up front, so a missing directory does
    # not abort the run after the recommendation has been computed.
    log_dir = os.path.dirname(log_file)
    if log_dir and not os.path.isdir(log_dir):
        os.makedirs(log_dir)

    rec = Recommender(cfg)
    repo_size = rec.items_repository.get_doccount()
    recommendation = rec.get_recommendation(user,repo_size)
    write_recall_log(label,sample,recommendation,log_file)
    plot_summary(sample,recommendation,repo_size,log_file)
91 85
def run_strategies(user,sample,n):
    """Run every configured strategy for one user/sample partition.

    Iterates over the module-level experiment grid (bm25_k1, profile_size,
    content_based, collaborative, popcon_size, neighbors) and calls
    run_iteration() once per combination. Those names are defined by the
    script's __main__ section and must exist before this is called.

    With no command line arguments both families run; otherwise only the
    ones named in sys.argv ("content" and/or "collaborative") run. The
    historical misspelling "colaborative" is still accepted.

    user   -- profile with the sampled packages already removed
    sample -- dict of held-out packages
    n      -- iteration number, used only in the result label
    """
    cfg = Config()
    label = {}

    # Fraction of the full profile that was held out. Parenthesise the
    # denominator and force float division: the sample is compared to the
    # remaining profile plus the sample itself.
    total = len(user.pkg_profile)+len(sample)
    if total:
        sample_proportion = float(len(sample))/total
    else:
        sample_proportion = 0.0

    run_all = len(sys.argv)<2
    run_content = run_all or "content" in sys.argv
    run_collaborative = (run_all or "colaborative" in sys.argv or
                         "collaborative" in sys.argv)

    if run_collaborative:
        # Remember the unmodified paths so each popcon size is derived
        # from the same base instead of accumulating suffixes.
        # NOTE(review): assumes both attributes are path-like strings to
        # which the size is appended -- confirm against Config.
        base_desktopapps = cfg.popcon_desktopapps
        base_programs = cfg.popcon_programs

    for k1 in bm25_k1:
        cfg.bm25_k1 = k1
        if run_content:
            for size in profile_size:
                cfg.profile_size = size
                for strategy in content_based:
                    cfg.strategy = strategy
                    label["description"] = "k1_bm25-profile-strategy-sample-n"
                    label["values"] = ("%.2f-%d-%s-%.2f-%d" %
                                       (cfg.bm25_k1,cfg.profile_size,
                                        cfg.strategy,sample_proportion,n))
                    run_iteration(user,cfg,label,sample)
        if run_collaborative:
            for strategy in collaborative:
                cfg.strategy = strategy
                for size in popcon_size:
                    # popcon_size mixes ints and 'full'; str() handles both.
                    cfg.popcon_desktopapps = base_desktopapps+str(size)
                    cfg.popcon_programs = base_programs+str(size)
                    for k in neighbors:
                        cfg.k_neighbors = k
                        k_str = "k"+str(cfg.k_neighbors)
                        label["description"] = "k1_bm25-popcon-strategy-k-sample-n"
                        # Label with the current size, not the whole list,
                        # so each popcon size gets its own result file.
                        label["values"] = ("%.2f-%s-%s-%s-%.2f-%d" %
                                           (cfg.bm25_k1,str(size),cfg.strategy,
                                            k_str,sample_proportion,n))
                        run_iteration(user,cfg,label,sample)
92 116
if __name__ == '__main__':
    # Experiment grid. run_strategies() reads bm25_k1, content_based,
    # collaborative, profile_size, popcon_size and neighbors as module
    # globals, so they must be defined before the loop below runs.
    iterations = 10
    samples_proportion = [0.5, 0.6, 0.7, 0.8, 0.9]
    weights = ['bm25', 'trad']
    bm25_k1 = [1.0, 1.2, 1.4, 1.6, 1.8, 2.0]
    content_based = ['cb','cbt','cbd','cbh',
                     'cb_eset','cbt_eset','cbd_eset','cbh_eset']
    collaborative = ['knn','knn_plus','knn_eset']
    hybrid = ['knnco','knnco_eset']

    profile_size = range(10,100,10)
    popcon_size = [1000,10000,50000,'full']
    neighbors = range(10,510,100)

    user = LocalSystem()
    # Alternative profile source kept from the original:
    #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
    user.maximal_pkg_profile()
    for sample_proportion in samples_proportion:
        for n in range(iterations):
            # Fill user profile
            item_score = {}
            for pkg in user.pkg_profile:
                item_score[pkg] = user.item_score[pkg]
            # Prepare partition sample: draw the held-out packages in one
            # call instead of rebuilding the key list for every draw. The
            # explicit list() also works with Python 3 dict views.
            sample = {}
            sample_size = int(len(user.pkg_profile)*sample_proportion)
            for key in random.sample(list(item_score),sample_size):
                sample[key] = item_score.pop(key)
            run_strategies(User(item_score),sample,n)