Commit 0c315647260ea6842ee5292149894f3ce97b8ea1

Authored by Tássia Camões Araújo
1 parent cee99fe3
Exists in master and in 1 other branch add_vagrant

Moved deprecated tests.

src/experiments/deprecated/k-suite.py 0 → 100755
... ... @@ -0,0 +1,186 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + k-suite - experiment different neighborhood sizes
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import sys
  23 +sys.path.insert(0,'../')
  24 +from config import Config
  25 +from data import PopconXapianIndex, PopconSubmission
  26 +from recommender import Recommender
  27 +from user import LocalSystem, User
  28 +from evaluation import *
  29 +import logging
  30 +import random
  31 +import Gnuplot
  32 +import numpy
  33 +
def plot_roc(k,roc_points,log_file):
    """
    Draw the ROC scatter plot for one neighborhood size.

    k          -- neighborhood size; appears in the title and the file names
    roc_points -- points handed to Gnuplot.Data (the caller passes
                  [false positive rate, true positive rate] pairs)
    log_file   -- path prefix; "<log_file>-kNNN.png" and
                  "<log_file>-kNNN.ps" are written
    """
    chart = Gnuplot.Gnuplot()
    chart('set style data points')
    chart.xlabel('False Positive Rate')
    chart.ylabel('True Positive Rate')
    # Both rates are plotted on a [0, 1] axis.
    for axis_range in ('set xrange [0:1.0]', 'set yrange [0:1.0]'):
        chart(axis_range)

    setup_name = log_file.split("/")[-1]
    chart.title("Setup: %s-k%d" % (setup_name,k))

    # The diagonal from (0,0) to (1,1) is drawn as a reference line.
    diagonal = Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7")
    measured = Gnuplot.Data(roc_points)
    chart.plot(diagonal,measured)

    output_base = log_file+("-k%.3d" % k)
    chart.hardcopy(output_base+".png",terminal="png")
    chart.hardcopy(output_base+".ps",terminal="postscript",enhanced=1,color=1)
  46 +
def plot_summary(precision,f05,mcc,log_file):
    """
    Plot the mean precision, F0.5 and MCC against the neighborhood size.

    precision, f05, mcc -- dicts mapping a neighborhood size k to the list
                           of values collected for that k
    log_file            -- path prefix; "<log_file>.png" and
                           "<log_file>.ps" are written

    The plot uses the "lines" style, which connects points in the order they
    are given, so each curve is emitted in ascending k instead of dict order.
    A k whose list is empty has no mean and is left out of the curve.
    """
    def mean_curve(series):
        # float() keeps the mean exact even if the collected values are ints.
        return [[k,float(sum(series[k]))/len(series[k])]
                for k in sorted(series) if series[k]]

    g = Gnuplot.Gnuplot()
    g('set style data lines')
    g.xlabel('Neighborhood (k)')
    g.title("Setup: %s-size20" % (log_file.split("/")[-1]))
    g.plot(Gnuplot.Data(mean_curve(precision),title="P"),
           Gnuplot.Data(mean_curve(f05),title="F05"),
           Gnuplot.Data(mean_curve(mcc),title="MCC"))
    g.hardcopy(log_file+".png",terminal="png")
    g.hardcopy(log_file+".ps",terminal="postscript",enhanced=1,color=1)
  57 +
class ExperimentResults:
    """
    Accumulates evaluation metrics over repeated recommendation rounds.

    One list per metric (precision, recall, false positive rate, F0.5, MCC)
    grows by one entry for every add_result() call; the get_* methods return
    the arithmetic mean of what has been collected so far.
    """

    def __init__(self,repo_size):
        """repo_size is stored and passed to every Evaluation."""
        self.repository_size = repo_size
        self.precision = []
        self.recall = []
        self.fpr = []
        self.f05 = []
        self.mcc = []

    @staticmethod
    def _mean(values):
        """Return the mean of values, or 0 when nothing was collected."""
        if not values:
            return 0
        # float() avoids Python 2 integer truncation when the values are
        # ints, matching how the script averages its final summary.
        return float(sum(values))/len(values)

    def add_result(self,ranking,sample):
        """
        Evaluate one recommendation and record all metrics.

        ranking -- recommended items; each one is given score 1
        sample  -- dict used to build the real (expected) result
        """
        predicted = RecommendationResult(dict.fromkeys(ranking,1))
        real = RecommendationResult(sample)
        evaluation = Evaluation(predicted,real,self.repository_size)
        self.precision.append(evaluation.run(Precision()))
        self.recall.append(evaluation.run(Recall()))
        self.fpr.append(evaluation.run(FPR()))
        self.f05.append(evaluation.run(F_score(0.5)))
        self.mcc.append(evaluation.run(MCC()))

    def get_roc_point(self):
        """Return [mean fpr, mean recall], or [0,0] if either list is empty."""
        if not self.recall or not self.fpr:
            return [0,0]
        return [self._mean(self.fpr),self._mean(self.recall)]

    def get_precision_summary(self):
        """Return the mean precision (0 if nothing was recorded)."""
        return self._mean(self.precision)

    def get_f05_summary(self):
        """Return the mean F0.5 score (0 if nothing was recorded)."""
        return self._mean(self.f05)

    def get_mcc_summary(self):
        """Return the mean MCC (0 if nothing was recorded)."""
        return self._mean(self.mcc)
  95 +
if __name__ == '__main__':
    # "os" is used below for path handling but is not imported by name at
    # the top of the file, so it is imported here to keep the script runnable.
    import os

    if len(sys.argv)<3:
        print("Usage: k-suite strategy_str sample_file")
        sys.exit(1)
    threshold = 20      # recommendation size requested from the recommender
    iterations = 30     # rounds per (user, k) pair
    neighbors = [3,5,10,50,100,150,200,300,400,500]
    cfg = Config()
    cfg.strategy = sys.argv[1]
    sample_file = sys.argv[2]
    sample_name = sample_file.split('/')[-1]

    # One popcon submission path per user id listed in the sample file;
    # submissions are stored under a directory named by the first two
    # characters of the id. Blank lines are skipped.
    population_sample = []
    with open(sample_file,'r') as f:
        for line in f:
            user_id = line.strip('\n')
            if not user_id:
                continue
            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
    if not population_sample:
        # Without users there is nothing to average or plot.
        print("No user ids found in %s" % sample_file)
        sys.exit(1)

    # setup dictionaries and files
    roc_summary = {}
    recommended = {}
    precision_summary = {}
    f05_summary = {}
    mcc_summary = {}
    sample_dir = ("results/k-suite/%s" % sample_name)
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)
    log_file = os.path.join(sample_dir,cfg.strategy)
    with open(log_file,'w') as f:
        f.write("# %s\n\n" % sample_name)
        f.write("# strategy %s recommendation_size %d iterations %d\n\n" %
                (cfg.strategy,threshold,iterations))
        f.write("# k coverage \tprecision \tf05 \tmcc\n\n")

    for k in neighbors:
        roc_summary[k] = []
        recommended[k] = set()
        precision_summary[k] = []
        f05_summary[k] = []
        mcc_summary[k] = []
        with open(log_file+"-k%.3d"%k,'w') as f:
            f.write("# %s\n\n" % sample_name)
            f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))
            f.write("# roc_point \tprecision \tf05 \tmcc\n\n")

    # main loop per user
    repo_size = 0
    for submission_file in population_sample:
        # NOTE(review): PopconSystem is not imported by name in this file;
        # presumably it arrives through "from evaluation import *" -- confirm.
        user = PopconSystem(submission_file)
        user.filter_pkg_profile(cfg.pkgs_filter)
        user.maximal_pkg_profile()
        for k in neighbors:
            cfg.k_neighbors = k
            rec = Recommender(cfg)
            repo_size = rec.items_repository.get_doccount()
            results = ExperimentResults(repo_size)
            # n iterations for same recommender and user
            for n in range(iterations):
                # Fill sample profile: 90% of the profile is moved at random
                # into "sample"; what is left in item_score becomes the
                # profile of the user the recommender sees in this round.
                profile_len = len(user.pkg_profile)
                item_score = dict((pkg,user.item_score[pkg])
                                  for pkg in user.pkg_profile)
                sample = {}
                sample_size = int(profile_len*0.9)
                for i in range(sample_size):
                    key = random.choice(list(item_score))
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,threshold)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
                    recommended[k] = recommended[k].union(recommendation.ranking)
            # save summary
            roc_point = results.get_roc_point()
            roc_summary[k].append(roc_point)
            precision = results.get_precision_summary()
            precision_summary[k].append(precision)
            f05 = results.get_f05_summary()
            f05_summary[k].append(f05)
            mcc = results.get_mcc_summary()
            mcc_summary[k].append(mcc)
            with open(log_file+"-k%.3d"%k,'a') as f:
                f.write("[%.2f,%.2f] \t%.4f \t%.4f \t%.4f\n" %
                        (roc_point[0],roc_point[1],precision,f05,mcc))

    # back to main flow
    plot_summary(precision_summary,f05_summary,mcc_summary,log_file)
    with open(log_file,'a') as f:
        for k in neighbors:
            # Coverage: distinct items ever recommended with this k, relative
            # to the repository size reported by the last recommender built.
            if repo_size:
                coverage = len(recommended[k])/float(repo_size)
            else:
                coverage = 0.0
            f.write("%3d \t%.2f \t%.4f \t%.4f \t%.4f\n" %
                    (k,coverage,
                     float(sum(precision_summary[k]))/len(precision_summary[k]),
                     float(sum(f05_summary[k]))/len(f05_summary[k]),
                     float(sum(mcc_summary[k]))/len(mcc_summary[k])))
            plot_roc(k,roc_summary[k],log_file)
... ...
src/experiments/deprecated/strategies-suite.py 0 → 100755
... ... @@ -0,0 +1,274 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + recommender suite - recommender experiments suite
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import sys
  23 +sys.path.insert(0,'../')
  24 +from config import Config
  25 +from data import PopconXapianIndex, PopconSubmission, AppAptXapianIndex
  26 +from recommender import Recommender
  27 +from user import LocalSystem, User
  28 +from evaluation import *
  29 +import logging
  30 +import random
  31 +import Gnuplot
  32 +
# Experiment grid for the strategies suite.
#
# A much smaller setup is kept here, commented out, as an alternative:
#   iterations = 3
#   sample_proportions = [0.9]
#   weighting = [('bm25',1.2)]
#   collaborative = ['knn']
#   content_based = []
#   hybrid = ['knnco']
#   profile_size = [50,100]
#   popcon_size = ["1000"]
#   neighbors = [50]

# Rounds executed for every setup.
iterations = 10

# Fractions of the user profile moved into the sample in each round.
sample_proportions = [
    0.5,
    0.6,
    0.7,
    0.8,
    0.9,
]

# (cfg.weight, cfg.bm25_k1) pairs.
weighting = [
    ('bm25',1.2),
    ('bm25',1.6),
    ('bm25',2.0),
    ('trad',0),
]

# Strategy names, grouped by family.
content_based = [
    'cb',
    'cbt',
    'cbd',
    'cbh',
    'cb_eset',
    'cbt_eset',
    'cbd_eset',
    'cbh_eset',
]
collaborative = [
    'knn_eset',
    'knn',
    'knn_plus',
]
hybrid = [
    'knnco',
    'knnco_eset',
]

# Values assigned to cfg.profile_size and cfg.k_neighbors respectively.
profile_size = range(20,100,20)
neighbors = range(10,510,50)
# Disabled: popcon_size = [1000,10000,50000,'full']
  53 +
def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
    """
    Write the recall log of one iteration to the file "<log_file>-<n>".

    label          -- dict with the keys "description" and "values"
    n              -- iteration number; part of the file name and header
    sample         -- dict whose keys are the packages expected back
    recommendation -- object that may carry a "ranking" list; without that
                      attribute only the header is written
    profile_size   -- written in the header line
    repo_size      -- written in the header line
    log_file       -- path prefix of the output file

    After the header, the 0-based ranking positions of the sample packages
    are written in ascending order, followed by the packages that do not
    appear in the ranking at all.
    """
    # "with" closes the file even if one of the writes raises.
    with open(("%s-%d" % (log_file,n)),'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%d\n" % (label["values"],n))
        output.write("\n%d %d %d\n" %
                     (repo_size,profile_size,len(sample)))
        if hasattr(recommendation,"ranking"):
            # First position of every ranked package, built in one pass so
            # each sample lookup is O(1) instead of a scan of the ranking.
            # setdefault keeps the first occurrence, like list.index does.
            position = {}
            for index,pkg in enumerate(recommendation.ranking):
                position.setdefault(pkg,index)
            ranks = []
            notfound = []
            for pkg in sample:
                if pkg in position:
                    ranks.append(position[pkg])
                else:
                    notfound.append(pkg)
            for r in sorted(ranks):
                output.write(str(r)+"\n")
            if notfound:
                output.write("Out of recommendation:\n")
                for pkg in notfound:
                    output.write(pkg+"\n")
  76 +
def plot_summary(precision,recall,f1,f05,accuracy,log_file):
    """
    Plot all metric curves against the recommendation size.

    precision, recall, f1, f05, accuracy -- data sets handed to Gnuplot.Data
                                            (the caller passes lists of
                                            [size, mean value] pairs)
    log_file -- path prefix; the plot is saved as "<log_file>.png"/".ps" and,
                with a logarithmic x axis, as "<log_file>-logscale.png"/".ps"
    """
    curves = (
        (accuracy,"Accuracy"),
        (precision,"Precision"),
        (recall,"Recall"),
        (f1,"F_1"),
        (f05,"F_0.5"),
    )

    chart = Gnuplot.Gnuplot()

    def export(suffix):
        # Save the current plot as PNG and as colour PostScript.
        chart.hardcopy(log_file+suffix+".png",terminal="png")
        chart.hardcopy(log_file+suffix+".ps",terminal="postscript",enhanced=1,color=1)

    # Plot metrics summary
    chart('set style data lines')
    chart.xlabel('Recommendation size')
    chart.title("Setup: %s" % log_file.split("/")[-1])
    chart.plot(*[Gnuplot.Data(points,title=name) for points,name in curves])
    export("")

    # Same curves again on a logarithmic x axis.
    chart('set logscale x')
    chart('replot')
    export("-logscale")
  94 +
def get_label(cfg,sample_proportion):
    """
    Build the label that identifies one experiment setup.

    Returns a dict with a "description" (names of the fields) and "values"
    (the same fields filled in from cfg and sample_proportion). Which fields
    are used depends on the family cfg.strategy belongs to. For a strategy
    in none of the known families a message is printed and an empty dict is
    returned.
    """
    strategy = cfg.strategy
    if strategy in content_based:
        description = "strategy-filter-profile-k1_bm25-sample"
        template = "%s-profile%d-%s-kbm%.1f-sample%.1f"
        leading = (strategy,cfg.profile_size)
    elif strategy in collaborative:
        description = "strategy-knn-filter-k1_bm25-sample"
        template = "%s-k%d-%s-kbm%.1f-sample%.1f"
        leading = (strategy,cfg.k_neighbors)
    elif strategy in hybrid:
        description = "strategy-knn-filter-profile-k1_bm25-sample"
        template = "%s-k%d-profile%d-%s-kbm%.1f-sample%.1f"
        leading = (strategy,cfg.k_neighbors,cfg.profile_size)
    else:
        print("Unknown strategy")
        return {}

    # Every family ends with the same three fields: the last path component
    # of the package filter, the BM25 k1 value and the sample proportion.
    trailing = (cfg.pkgs_filter.split("/")[-1],cfg.bm25_k1,sample_proportion)
    return {
        "description": description,
        "values": template % (leading+trailing),
    }
  118 +
class ExperimentResults:
    """
    Collects evaluation metrics for a range of recommendation sizes.

    Every metric (accuracy, precision, recall, F1, F0.5) is a dict mapping a
    recommendation size to the list of values measured at that size; each
    add_result() call appends one value per size and metric.
    """

    def __init__(self,repo_size):
        """
        Prepare empty result lists for sizes 1, 10..190 (step 10) and
        200..repo_size (step 100, repo_size excluded).
        """
        self.repository_size = repo_size
        self.accuracy = {}
        self.precision = {}
        self.recall = {}
        self.f1 = {}
        self.f05 = {}
        # list() keeps the concatenation valid where range() is not a list.
        points = ([1]+list(range(10,200,10))+
                  list(range(200,self.repository_size,100)))
        for size in points:
            self.accuracy[size] = []
            self.precision[size] = []
            self.recall[size] = []
            self.f1[size] = []
            self.f05[size] = []

    @staticmethod
    def _summary(metric):
        """Return [[size, mean], ...] sorted by size; empty lists average to 0."""
        summary = []
        for size,values in metric.items():
            # float() avoids integer truncation under Python 2; the empty
            # case would otherwise raise ZeroDivisionError.
            mean = float(sum(values))/len(values) if values else 0
            summary.append([size,mean])
        return sorted(summary)

    @staticmethod
    def _best(metric):
        """Return (size, value) of the highest single value; 0 for empty lists."""
        def peak(size):
            values = metric[size]
            return max(values) if values else 0
        size = max(metric,key=peak)
        return (size,peak(size))

    def add_result(self,ranking,sample):
        """
        Evaluate the first "size" items of ranking, for every tracked size.

        ranking -- list of recommended items; each kept item gets score 1
        sample  -- dict used to build the real (expected) result
        """
        for size in self.accuracy.keys():
            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repository_size)
            self.accuracy[size].append(evaluation.run(Accuracy()))
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            self.f1[size].append(evaluation.run(F_score(1)))
            self.f05[size].append(evaluation.run(F_score(0.5)))

    def get_precision_summary(self):
        """Return the sorted [size, mean precision] pairs."""
        return self._summary(self.precision)

    def get_recall_summary(self):
        """Return the sorted [size, mean recall] pairs."""
        return self._summary(self.recall)

    def get_f1_summary(self):
        """Return the sorted [size, mean F1] pairs."""
        return self._summary(self.f1)

    def get_f05_summary(self):
        """Return the sorted [size, mean F0.5] pairs."""
        return self._summary(self.f05)

    def get_accuracy_summary(self):
        """Return the sorted [size, mean accuracy] pairs."""
        return self._summary(self.accuracy)

    def best_precision(self):
        """Return (size, value) of the highest precision recorded."""
        return self._best(self.precision)

    def best_f1(self):
        """Return (size, value) of the highest F1 recorded."""
        return self._best(self.f1)

    def best_f05(self):
        """Return (size, value) of the highest F0.5 recorded."""
        return self._best(self.f05)
  177 +
def run_strategy(cfg,user):
    """
    Run the experiment for the strategy currently set in cfg.

    For every weighting scheme and sample proportion, a random part of the
    user's profile is moved into a sample, a recommendation is requested for
    the remaining profile and the result is evaluated. This is repeated
    "iterations" times. Each round writes a recall log; each setup writes a
    summary file and plots under "results/strategies/".

    cfg  -- configuration object; its weight and bm25_k1 fields are set here
    user -- object providing pkg_profile and item_score
    """
    # Function-scope import: the file does not import "os" by name.
    import os

    output_dir = "results/strategies"
    # The log files are opened for writing below, so the directory must exist.
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    for weight in weighting:
        cfg.weight = weight[0]
        cfg.bm25_k1 = weight[1]
        rec = Recommender(cfg)
        repo_size = rec.items_repository.get_doccount()
        for proportion in sample_proportions:
            results = ExperimentResults(repo_size)
            label = get_label(cfg,proportion)
            log_file = output_dir+"/"+label["values"]
            recommendation = None
            evaluated = 0
            for n in range(iterations):
                # Fill sample profile: the chosen proportion of the profile
                # is moved at random into "sample"; what is left becomes the
                # profile of the user the recommender sees in this round.
                profile_len = len(user.pkg_profile)
                item_score = dict((pkg,user.item_score[pkg])
                                  for pkg in user.pkg_profile)
                sample = {}
                sample_size = int(profile_len*proportion)
                for i in range(sample_size):
                    key = random.choice(list(item_score))
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,repo_size)
                write_recall_log(label,n,sample,recommendation,profile_len,repo_size,log_file)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
                    evaluated += 1

            if not evaluated:
                # Nothing was measured, so there is nothing to average or
                # plot; averaging empty lists would divide by zero.
                print("No ranking produced for %s; summary skipped" % label["values"])
                continue

            precision_10 = sum(results.precision[10])/len(results.precision[10])
            f1_10 = sum(results.f1[10])/len(results.f1[10])
            f05_10 = sum(results.f05[10])/len(results.f05[10])
            best_precision = results.best_precision()
            best_f1 = results.best_f1()
            best_f05 = results.best_f05()
            with open(log_file,'w') as f:
                # "coverage" reports the size of the last recommendation.
                f.write("# %s\n# %s\n\ncoverage %d\n\n" %
                        (label["description"],label["values"],recommendation.size))
                f.write("# best results (recommendation size; metric)\n")
                f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\nf05 (%d; %.2f)\n\n" %
                        (best_precision[0],best_precision[1],
                         best_f1[0],best_f1[1],
                         best_f05[0],best_f05[1]))
                f.write("# recommendation size 10\nprecision (10; %.2f)\nf1 (10; %.2f)\nf05 (10; %.2f)" %
                        (precision_10,f1_10,f05_10))
            precision = results.get_precision_summary()
            recall = results.get_recall_summary()
            f1 = results.get_f1_summary()
            f05 = results.get_f05_summary()
            accuracy = results.get_accuracy_summary()
            plot_summary(precision,recall,f1,f05,accuracy,log_file)
  223 +
def run_content(user,cfg):
    """
    Run every content-based strategy with every profile size.

    cfg.strategy and cfg.profile_size are overwritten for each combination
    before run_strategy is called.
    """
    for strategy_name in content_based:
        cfg.strategy = strategy_name
        for profile_limit in profile_size:
            cfg.profile_size = profile_limit
            run_strategy(cfg,user)
  230 +
def run_collaborative(user,cfg):
    """
    Run every collaborative strategy with every neighborhood size.

    cfg.strategy and cfg.k_neighbors are overwritten for each combination
    before run_strategy is called.
    """
    # NOTE: a sweep over popcon index sizes used to be sketched here as
    # commented-out code; it was never active and has been removed together
    # with the two locals that only it would have used.
    for strategy in collaborative:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            run_strategy(cfg,user)
  243 +
def run_hybrid(user,cfg):
    """
    Run every hybrid strategy with every neighborhood and profile size.

    cfg.strategy, cfg.k_neighbors and cfg.profile_size are overwritten for
    each combination before run_strategy is called.
    """
    # NOTE: a sweep over popcon index sizes used to be sketched here as
    # commented-out code; it was never active and has been removed together
    # with the two locals that only it would have used.
    for strategy in hybrid:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            for size in profile_size:
                cfg.profile_size = size
                run_strategy(cfg,user)
  258 +
if __name__ == '__main__':
    # Entry point: load one fixed popcon submission as the test user and run
    # the strategy families named on the command line (all of them when no
    # argument is given).
    cfg = Config()

    # Other users tried before (kept for reference):
    #   LocalSystem()
    #   RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
    #   PopconSystem("/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
    user = PopconSystem("/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7")
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    run_everything = len(sys.argv)<2
    suites = (
        ("content",run_content),
        ("collaborative",run_collaborative),
        ("hybrid",run_hybrid),
    )
    for keyword,runner in suites:
        if keyword in sys.argv or run_everything:
            runner(user,cfg)
... ...