Commit dc8ededf327ee77c4bf11a528452c1f61c260f2e

Authored by Tássia Camões Araújo
1 parent 78a934e4
Exists in master and in 1 other branch add_vagrant

Updated experiments.

src/experiments/k-suite.py 0 → 100755
... ... @@ -0,0 +1,152 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + recommender suite - recommender experiments suite
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import sys
  23 +sys.path.insert(0,'../')
  24 +from config import Config
  25 +from data import PopconXapianIndex, PopconSubmission
  26 +from recommender import Recommender
  27 +from user import LocalSystem, User
  28 +from evaluation import *
  29 +import logging
  30 +import random
  31 +import Gnuplot
  32 +import numpy
  33 +
def plot_roc(p,roc_points,log_file):
    """Plot the ROC points measured for one neighbourhood size.

    p          -- number of neighbours the points were measured with; it is
                  shown in the curve title and embedded in the file names
    roc_points -- sequence of [false positive rate, true positive rate]
    log_file   -- path prefix of the generated images

    Writes <log_file>-kNNN.png and <log_file>-kNNN.ps.
    """
    g = Gnuplot.Gnuplot()
    g('set style data points')
    g.xlabel('False Positive Rate')
    g.ylabel('True Positive Rate')
    g('set xrange [0:1.0]')
    g('set yrange [0:1.0]')
    g.title("Setup: %s" % log_file.split("/")[-1])
    # The straight line from (0,0) to (1,1) is drawn as a visual reference.
    # Use the parameter p here: the previous code read a global named k,
    # which only worked when the caller's loop variable had that name.
    g.plot(Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"),
           Gnuplot.Data(roc_points,title="k %d"%p))
    g.hardcopy(log_file+("-k%.3d.png"%p),terminal="png")
    g.hardcopy(log_file+("-k%.3d.ps"%p),terminal="postscript",enhanced=1,color=1)
  46 +
class ExperimentResults:
    """Collect precision, recall and false positive rate over several runs.

    One value per metric is appended by every add_result() call; the
    get_* methods return the arithmetic mean of what was collected.
    """

    def __init__(self,repo_size):
        """repo_size is handed to every Evaluation as the repository size."""
        self.repository_size = repo_size
        self.precision = []
        self.recall = []
        self.fpr = []

    @staticmethod
    def _mean(values):
        """Return the mean of values as a float, or 0.0 when it is empty.

        The empty case happens when no recommendation could be evaluated;
        returning 0.0 avoids a ZeroDivisionError in the middle of a long
        experiment run.
        """
        if not values:
            return 0.0
        return sum(values)/float(len(values))

    def add_result(self,ranking,sample):
        """Evaluate a ranking against the held-out sample and store metrics.

        ranking -- iterable of recommended items (each gets score 1)
        sample  -- dict of the items that were hidden from the recommender
        """
        predicted = RecommendationResult(dict.fromkeys(ranking,1))
        real = RecommendationResult(sample)
        evaluation = Evaluation(predicted,real,self.repository_size)
        self.precision.append(evaluation.run(Precision()))
        self.recall.append(evaluation.run(Recall()))
        self.fpr.append(evaluation.run(FPR()))

    # Average ROC by threshold (which is the size)
    def get_roc_point(self):
        """Return [mean false positive rate, mean true positive rate]."""
        return [self._mean(self.fpr),self._mean(self.recall)]

    def get_precision_summary(self):
        """Return the mean precision over all recorded results."""
        return self._mean(self.precision)

    def get_recall_summary(self):
        """Return the mean recall over all recorded results."""
        return self._mean(self.recall)
  73 +
if __name__ == '__main__':
    # os.path.join is needed below, but the import block of this script
    # does not import os explicitly, so bring it into scope here.
    import os

    # NOTE(review): PopconSystem is not imported explicitly either; it is
    # presumably provided by "from evaluation import *" -- confirm.

    # experiment parameters
    threshold = 20
    iterations = 30
    sample_file = "results/misc-popcon/sample-050-100"
    neighbors = [3,5,10,50,100,150,200,300,400,500]
    cfg = Config()
    cfg.strategy = "knn"
    print(cfg.popcon_index)

    # Each line of the sample file is a user id; the submission is stored
    # under <popcon_dir>/<first two characters of the id>/<id>.
    submissions = []
    with open(sample_file,'r') as f:
        for line in f:
            user_id = line.strip('\n')
            submissions.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
    if not submissions:
        # Without any user the summary below would fail on undefined and
        # empty values, so stop early with a clear message.
        sys.exit("No submissions listed in %s" % sample_file)

    # setup dictionaries and files (one entry / log file per k)
    roc_points = {}
    recommended = {}
    precisions = {}
    aucs = {}
    log_file = "results/k-suite/sample-050-100/%s" % (cfg.strategy)
    for k in neighbors:
        roc_points[k] = []
        recommended[k] = set()
        precisions[k] = []
        aucs[k] = []
        with open(log_file+"-k%.3d"%k,'w') as f:
            f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))
            f.write("# roc_point \tp(20) \tauc\n\n")

    # main loop per user
    for submission_file in submissions:
        user = PopconSystem(submission_file)
        user.filter_pkg_profile(cfg.pkgs_filter)
        user.maximal_pkg_profile()
        for k in neighbors:
            cfg.k_neighbors = k
            rec = Recommender(cfg)
            repo_size = rec.items_repository.get_doccount()
            results = ExperimentResults(repo_size)
            # n iterations for same recommender and user
            for n in range(iterations):
                # Move a random 90% of the profile into the held-out set;
                # the remaining 10% is what the recommender gets to see.
                profile_size = len(user.pkg_profile)
                item_score = {}
                for pkg in user.pkg_profile:
                    item_score[pkg] = user.item_score[pkg]
                held_out = {}
                sample_size = int(profile_size*0.9)
                for i in range(sample_size):
                    key = random.choice(list(item_score))
                    held_out[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,threshold)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,held_out)
                    print("ranking %s" % (recommendation.ranking,))
                    print("recommended_%d %s" % (k,recommended[k]))
                    recommended[k] = recommended[k].union(recommendation.ranking)
                    print(recommended[k])
            # save summary for this user and k
            roc_point = results.get_roc_point()
            # area under the polyline (0,0) -> roc_point -> (1,1)
            auc = numpy.trapz(y=[0,roc_point[1],1],x=[0,roc_point[0],1])
            p_20 = results.get_precision_summary()
            roc_points[k].append(roc_point)
            aucs[k].append(auc)
            precisions[k].append(p_20)
            with open(log_file+"-k%.3d"%k,'a') as f:
                f.write("%s \t%.2f \t%.4f\n" % (str(roc_point),p_20,auc))

    # back to main flow: one summary line and one plot per k
    with open(log_file,'w') as f:
        f.write("# k coverage \tp(20) \tauc\n\n")
        for k in neighbors:
            print("len_recommended_%d %d" % (k,len(recommended[k])))
            print("repo_size %s" % repo_size)
            # repo_size is the value from the last recommender built above
            coverage = len(recommended[k])/float(repo_size)
            print(coverage)
            # The last field used to be "%.2fi"; the stray "i" ended up in
            # the log and broke numeric parsing of the column.
            f.write("%d \t%.2f \t%.2f \t%.2f\n" %
                    (k,coverage,float(sum(precisions[k]))/len(precisions[k]),
                     float(sum(aucs[k]))/len(aucs[k])))
            plot_roc(k,roc_points[k],log_file)
... ...
src/experiments/popcon-population.py 0 → 100755
... ... @@ -0,0 +1,74 @@
  1 +#! /usr/bin/env python
  2 +"""
  3 + misc_popcon - misc experiments with popcon data
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import Gnuplot
  23 +import xapian
  24 +import os
  25 +import random
  26 +import sys
  27 +
def get_population_profile(popcon):
    """Build a histogram of profile sizes over all documents of the index.

    popcon -- object offering get_doccount() and get_document(n); each
              document offers termlist() and get_data()

    The profile size of a document is the number of its terms whose text
    starts with "XP". Documents with fewer than 10 such terms are reported
    on stdout.

    Returns (population_profile, max_profile): a list of
    (profile size, number of documents) tuples sorted by size, and the
    largest profile size seen. An empty index yields ([], 0).
    """
    profiles_size = []
    # Document ids are 1-based, so the upper bound must be doccount+1 or the
    # last document is skipped (assumes contiguous ids, as the sibling
    # sample-popcon script does).
    for n in range(1,popcon.get_doccount()+1):
        user = popcon.get_document(n)
        pkgs_count = len([t for t in user.termlist() if t.term.startswith("XP")])
        if pkgs_count<10:
            print("-- profile<10: %s" % user.get_data())
        profiles_size.append(pkgs_count)
    if not profiles_size:
        return [],0
    # Single counting pass instead of calling list.count() for every size.
    population = {}
    for size in profiles_size:
        population[size] = population.get(size,0)+1
    population_profile = sorted(population.items())
    max_profile = max(profiles_size)
    return population_profile,max_profile
  41 +
def get_profile_ranges(population_profile,max_profile,popcon_size):
    """Aggregate the profile-size histogram into fixed ranges.

    population_profile -- list of (profile size, number of users) tuples
    max_profile        -- upper bound of the last range
    popcon_size        -- total number of users, used for the proportions

    The ranges are (0,50], (50,100], ... (200,250] and (250,max_profile];
    the last one is empty when max_profile does not exceed 250.

    Returns (ranges_population, ranges_percentage): two lists of
    (range upper bound, value) tuples, holding the absolute number of
    users and the fraction of popcon_size respectively.
    """
    bounds = list(range(0,251,50))
    bounds.append(max_profile)
    ranges_population = []
    ranges_percentage = []
    # Pair every bound with its predecessor by position. Looking the
    # predecessor up with list.index() picked the wrong lower bound whenever
    # max_profile was equal to one of the fixed bounds.
    for minimum,maximum in zip(bounds,bounds[1:]):
        population = sum(count for size,count in population_profile
                         if minimum<size<=maximum)
        ranges_population.append((maximum,population))
        ranges_percentage.append((maximum,population/float(popcon_size)))
    return ranges_population,ranges_percentage
  54 +
def plot(data,xlabel,ylabel,output):
    """Plot data as points and save the chart as PNG and PostScript.

    data   -- anything accepted by Gnuplot's plot()
    xlabel -- label of the horizontal axis
    ylabel -- label of the vertical axis
    output -- path prefix; ".png" and ".ps" are appended to it
    """
    chart = Gnuplot.Gnuplot()
    chart('set style data points')
    chart.xlabel(xlabel)
    chart.ylabel(ylabel)
    chart.plot(data)
    png_file = output+".png"
    ps_file = output+".ps"
    chart.hardcopy(png_file,terminal="png")
    chart.hardcopy(ps_file,terminal="postscript",enhanced=1,color=1)
  63 +
if __name__ == '__main__':
    # Open the desktop-apps popcon index kept in the user's home directory.
    index_path = os.path.expanduser("~/.app-recommender/popcon_desktopapps")
    popcon = xapian.Database(index_path)
    print("Popcon repository size: %d" % popcon.get_doccount())

    # Histogram of profile sizes, then the same data grouped into ranges.
    profile_population,max_profile = get_population_profile(popcon)
    popcon_size = popcon.get_doccount()
    ranges_population,ranges_percentage = get_profile_ranges(
        profile_population,max_profile,popcon_size)
    print("Population per profile range (up to index)")
    print(ranges_population)
    plot(profile_population,"Desktop profile size","Population size",
         "results/misc-popcon/profile_population")
... ...
src/experiments/roc-suite.py 0 → 100755
... ... @@ -0,0 +1,328 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + recommender suite - recommender experiments suite
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import sys
  23 +sys.path.insert(0,'../')
  24 +from config import Config
  25 +from data import PopconXapianIndex, PopconSubmission
  26 +from recommender import Recommender
  27 +from user import LocalSystem, User
  28 +from evaluation import *
  29 +import logging
  30 +import random
  31 +import Gnuplot
  32 +import numpy
  33 +
  34 +#iterations = 3
  35 +#sample_proportions = [0.9]
  36 +#weighting = [('bm25',1.2)]
  37 +#collaborative = ['knn_eset']
  38 +#content_based = ['cb']
  39 +#hybrid = ['knnco']
  40 +#profile_size = [50,100]
  41 +#popcon_size = ["1000"]
  42 +#neighbors = [50]
  43 +
# Experiment grid read by the run_* functions below.
iterations = 30
sample_proportions = [0.9]
# (weighting scheme, bm25 k1 value) pairs
weighting = [
    ('bm25',1.0),
    ('bm25',1.2),
    ('bm25',2.0),
    ('trad',0),
]
# strategy names, grouped by recommender family
content_based = [
    'cb','cbt','cbd','cbh',
    'cb_eset','cbt_eset','cbd_eset','cbh_eset',
]
collaborative = ['knn_eset','knn','knn_plus']
hybrid = ['knnco','knnco_eset']
profile_size = range(20,200,20)
neighbors = range(10,510,50)
  52 +
def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
    """Write the recall log of one iteration to "<log_file>-NN".

    label          -- dict with the keys "description" and "values"
    n              -- iteration number, also used as the file name suffix
    sample         -- dict whose keys are the held-out packages
    recommendation -- object that may carry a "ranking" list
    profile_size   -- size of the full user profile
    repo_size      -- size of the items repository
    log_file       -- path prefix of the log

    After a header, the file lists the 0-based rank of every sample
    package found in the ranking (ascending), followed by the packages
    that were not recommended at all. Nothing but the header is written
    when the recommendation has no "ranking" attribute.
    """
    # "with" guarantees the file is closed even if a write fails.
    with open(("%s-%.2d" % (log_file,n)),'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%.2d\n" % (label["values"],n))
        output.write("\n# repository profile sample\n%d %d %d\n" %
                     (repo_size,profile_size,len(sample)))
        if hasattr(recommendation,"ranking"):
            # Map every package to its first position once, instead of
            # scanning the ranking again for each sample package.
            position = {}
            for rank,pkg in enumerate(recommendation.ranking):
                position.setdefault(pkg,rank)
            notfound = []
            ranks = []
            for pkg in sample.keys():
                if pkg in position:
                    ranks.append(position[pkg])
                else:
                    notfound.append(pkg)
            for r in sorted(ranks):
                output.write(str(r)+"\n")
            if notfound:
                output.write("# out of recommendation:\n")
                for pkg in notfound:
                    output.write(pkg+"\n")
  75 +
def plot_roc(roc_points,auc,eauc,c,p,log_file):
    """Draw the ROC curve of one setup and annotate it with its metrics.

    roc_points -- list of [false positive rate, true positive rate]
                  pairs; it must not be empty because the last point is
                  connected to (1,1)
    auc, eauc  -- values printed as "AUC" and "EAUC"
    c          -- value printed as "C"
    p          -- value printed as "P(20)"
    log_file   -- path prefix; "-roc.png" and "-roc.ps" are appended
    """
    chart = Gnuplot.Gnuplot()
    chart('set style data lines')
    chart.xlabel('False Positive Rate')
    chart.ylabel('True Positive Rate')
    for axis_range in ('set xrange [0:1.0]','set yrange [0:1.0]'):
        chart(axis_range)
    setup_name = log_file.split("/")[-1]
    chart.title("Setup: %s" % setup_name)
    # metric labels, stacked in the lower right corner
    annotations = (('set label "C %.2f" at 0.8,0.25',c),
                   ('set label "P(20) %.2f" at 0.8,0.2',p),
                   ('set label "AUC %.4f" at 0.8,0.15',auc),
                   ('set label "EAUC %.4f" at 0.8,0.1',eauc))
    for template,value in annotations:
        chart(template % value)
    curve = Gnuplot.Data(roc_points,title="ROC")
    diagonal = Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7")
    # segment joining the last measured point to (1,1)
    closing = Gnuplot.Data([roc_points[-1],[1,1]],with_="lines lt 6")
    chart.plot(curve,diagonal,closing)
    chart.hardcopy(log_file+"-roc.png",terminal="png")
    chart.hardcopy(log_file+"-roc.ps",terminal="postscript",enhanced=1,color=1)
  93 +
def plot_summary(precision,recall,f1,f05,accuracy,log_file):
    """Plot all metric summaries against the recommendation size.

    Each of precision, recall, f1, f05 and accuracy is a data set accepted
    by Gnuplot.Data. Four files are written: <log_file>.png/.ps with a
    linear x axis and <log_file>-logscale.png/.ps with a logarithmic one.
    """
    chart = Gnuplot.Gnuplot()
    chart('set style data lines')
    chart.xlabel('Recommendation size')
    chart.title("Setup: %s" % log_file.split("/")[-1])
    curves = [Gnuplot.Data(accuracy,title="Accuracy"),
              Gnuplot.Data(precision,title="Precision"),
              Gnuplot.Data(recall,title="Recall"),
              Gnuplot.Data(f1,title="F_1"),
              Gnuplot.Data(f05,title="F_0.5")]
    chart.plot(*curves)
    # linear scale first
    chart.hardcopy(log_file+".png",terminal="png")
    chart.hardcopy(log_file+".ps",terminal="postscript",enhanced=1,color=1)
    # then redraw the same curves with a logarithmic x axis
    chart('set logscale x')
    chart('replot')
    logscale_file = log_file+"-logscale"
    chart.hardcopy(logscale_file+".png",terminal="png")
    chart.hardcopy(logscale_file+".ps",terminal="postscript",enhanced=1,color=1)
  111 +
def get_label(cfg,sample_proportion):
    """Describe the current setup for log headers and file names.

    cfg               -- configuration; strategy, pkgs_filter and bm25_k1
                         are read, plus profile_size and/or k_neighbors
                         depending on the strategy family
    sample_proportion -- accepted but not used

    Returns a dict with "description" (names of the label fields) and
    "values" (the matching values). For a strategy that belongs to none
    of the known families "Unknown strategy" is printed and an empty dict
    is returned.
    """
    if cfg.strategy in content_based:
        description = "strategy-filter-profile-k1_bm25"
        fields = (cfg.strategy,cfg.profile_size,
                  cfg.pkgs_filter.split("/")[-1],
                  cfg.bm25_k1)
        values = "%s-profile%.3d-%s-kbm%.1f" % fields
    elif cfg.strategy in collaborative:
        description = "strategy-knn-filter-k1_bm25"
        fields = (cfg.strategy,cfg.k_neighbors,
                  cfg.pkgs_filter.split("/")[-1],
                  cfg.bm25_k1)
        values = "%s-k%.3d-%s-kbm%.1f" % fields
    elif cfg.strategy in hybrid:
        description = "strategy-knn-filter-profile-k1_bm25"
        fields = (cfg.strategy,cfg.k_neighbors,cfg.profile_size,
                  cfg.pkgs_filter.split("/")[-1],
                  cfg.bm25_k1)
        values = "%s-k%.3d-profile%.3d-%s-kbm%.1f" % fields
    else:
        print("Unknown strategy")
        return {}
    return {"description": description, "values": values}
  135 +
class ExperimentResults:
    """Collect evaluation metrics per recommendation size.

    Every metric is a dict mapping a recommendation size to the list of
    values measured for that size, one per add_result() call. Only
    precision, recall and fpr are currently filled in; accuracy, f1 and
    f05 keep empty lists.
    """

    def __init__(self,repo_size):
        """Prepare empty result lists for sizes 1, 10, 20, ... < repo_size."""
        self.repository_size = repo_size
        self.accuracy = {}
        self.precision = {}
        self.recall = {}
        self.f1 = {}
        self.f05 = {}
        self.fpr = {}
        #points = [1]+range(10,200,10)+range(200,self.repository_size,100)
        points = [1]+list(range(10,self.repository_size,10))
        # union of every item that appeared in any ranking
        self.recommended = set()
        for size in points:
            self.accuracy[size] = []
            self.precision[size] = []
            self.recall[size] = []
            self.f1[size] = []
            self.f05[size] = []
            self.fpr[size] = []

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of a non-empty list as a float."""
        return sum(values)/float(len(values))

    def _summary(self,metric):
        """Return sorted [size, mean] pairs for the sizes that have data."""
        return sorted([size,self._mean(values)]
                      for size,values in metric.items() if values)

    def _best(self,metric,sizes):
        """Return (size, value) of the highest single value among sizes.

        Sizes without data are ignored and ties go to the smallest size.
        Raises ValueError when no candidate size has data.
        """
        candidates = [size for size in sorted(sizes) if metric[size]]
        size = max(candidates, key = lambda x: max(metric[x]))
        return (size,max(metric[size]))

    def add_result(self,ranking,sample):
        """Evaluate the top-N prefix of ranking for every tracked size N.

        ranking -- ordered list of recommended items
        sample  -- dict of the held-out items used as the expected result
        """
        print("len_recommended %d" % len(self.recommended))
        print("len_rank %d" % len(ranking))
        self.recommended.update(ranking)
        print("len_recommended %d" % len(self.recommended))
        # get data only for point
        for size in self.accuracy.keys():
            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repository_size)
            #self.accuracy[size].append(evaluation.run(Accuracy()))
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            #self.f1[size].append(evaluation.run(F_score(1)))
            #self.f05[size].append(evaluation.run(F_score(0.5)))
            self.fpr[size].append(evaluation.run(FPR()))

    # Average ROC by threshold (which is the size)
    def get_roc_points(self):
        """Return sorted [mean fpr, mean tpr] points, one per size with data."""
        points = []
        for size in self.recall.keys():
            tpr = self.recall[size]
            fpr = self.fpr[size]
            # sizes that were never evaluated would divide by zero
            if tpr and fpr:
                points.append([self._mean(fpr),self._mean(tpr)])
        return sorted(points)

    def get_precision_summary(self):
        """Return sorted [size, mean precision] pairs."""
        return self._summary(self.precision)

    def get_recall_summary(self):
        """Return sorted [size, mean recall] pairs."""
        return self._summary(self.recall)

    def get_f1_summary(self):
        """Return sorted [size, mean F1] pairs."""
        return self._summary(self.f1)

    def get_f05_summary(self):
        """Return sorted [size, mean F0.5] pairs."""
        return self._summary(self.f05)

    def get_accuracy_summary(self):
        """Return sorted [size, mean accuracy] pairs."""
        return self._summary(self.accuracy)

    def best_precision(self):
        """Return (size, precision) of the best precision for sizes > 10.

        The previous key function combined the precision and the size
        filter with "and", which produced a boolean and therefore selected
        the first qualifying size rather than the best one. When no size
        above 10 has data, all sizes are considered.
        """
        sizes = [size for size in self.precision
                 if size>10 and self.precision[size]]
        if not sizes:
            sizes = list(self.precision)
        return self._best(self.precision,sizes)

    def best_f1(self):
        """Return (size, value) of the best F1; ValueError without data."""
        return self._best(self.f1,self.f1)

    def best_f05(self):
        """Return (size, value) of the best F0.5; ValueError without data."""
        return self._best(self.f05,self.f05)
  213 +
def run_strategy(cfg,user):
    """Run the cross-validation experiment for the strategy set in cfg.

    For every weighting scheme and sample proportion, a random part of
    the user profile is hidden "iterations" times, a recommendation is
    requested for the remaining profile and evaluated against the hidden
    part. A summary (coverage, p(20), AUC, extended AUC) is written to
    "results/<label values>" and the ROC curve is plotted next to it.

    cfg  -- configuration; weight and bm25_k1 are overwritten here
    user -- user whose pkg_profile and item_score feed the experiment
    """
    for weight in weighting:
        cfg.weight = weight[0]
        cfg.bm25_k1 = weight[1]
        rec = Recommender(cfg)
        repo_size = rec.items_repository.get_doccount()
        for proportion in sample_proportions:
            results = ExperimentResults(repo_size)
            label = get_label(cfg,proportion)
            #log_file = "results/20110906/4a67a295/"+label["values"]
            log_file = "results/"+label["values"]
            evaluated = 0
            for n in range(iterations):
                # Fill sample profile: move a random share of the profile
                # into "sample"; what is left is the recommender input.
                profile_size = len(user.pkg_profile)
                item_score = {}
                for pkg in user.pkg_profile:
                    item_score[pkg] = user.item_score[pkg]
                sample = {}
                sample_size = int(profile_size*proportion)
                for i in range(sample_size):
                    key = random.choice(list(item_score))
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,repo_size)
                #write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
                    evaluated += 1
            if not evaluated:
                # Nothing was measured, so there is no ROC point to report.
                print("No ranking produced for %s, skipping summary" %
                      label["values"])
                continue
            # Compute everything before opening the log so that a failure
            # here does not leave a truncated, empty log file behind.
            roc_points = results.get_roc_points()
            x_coord = [p[0] for p in roc_points]
            y_coord = [p[1] for p in roc_points]
            auc = numpy.trapz(y=y_coord, x=x_coord)
            # extended AUC: add the areas between (0,0) and the first point
            # and between the last point and (1,1)
            eauc = (auc+
                    numpy.trapz(y=[0,roc_points[0][1]],x=[0,roc_points[0][0]])+
                    numpy.trapz(y=[roc_points[-1][1],1],x=[roc_points[-1][0],1]))
            # NOTE(review): reported as p(20) but read from size 10 --
            # confirm which recommendation size is intended.
            precision_20 = sum(results.precision[10])/len(results.precision[10])
            print(results.recommended)
            print("len %d" % len(results.recommended))
            coverage = len(results.recommended)/float(repo_size)
            print("repo_size:  %s" % float(repo_size))
            print(coverage)
            # A leftover debugging exit(1) used to stop the program at this
            # point, making the log and plot code below unreachable.
            #f1_10 = sum(results.f1[10])/len(results.f1[10])
            #f05_10 = sum(results.f05[10])/len(results.f05[10])
            with open(log_file,'w') as f:
                f.write("# %s\n# %s\n\n" %
                        (label["description"],label["values"]))
                f.write("# coverage \tp(20) \tauc \teauc\n\t%.2f \t%.2f \t%.4f \t%.4f\n\n" %
                        (coverage,precision_20,auc,eauc))
            plot_roc(roc_points,auc,eauc,coverage,precision_20,log_file)
  276 +
def run_content(user,cfg):
    """Run the experiment for every content-based strategy.

    Each strategy listed in content_based is combined with each value of
    profile_size; both are stored in cfg before run_strategy is called.
    """
    for strategy_name in content_based:
        cfg.strategy = strategy_name
        for wanted_size in profile_size:
            cfg.profile_size = wanted_size
            run_strategy(cfg,user)
  283 +
def run_collaborative(user,cfg):
    """Run the experiment for every collaborative strategy.

    Each strategy listed in collaborative is combined with each value of
    neighbors; both are stored in cfg before run_strategy is called.
    """
    # Base index names; only needed by the disabled popcon_size variation,
    # which appended "_<size>" to them for every entry of popcon_size.
    popcon_programs_base = cfg.popcon_desktopapps
    popcon_programs_base = cfg.popcon_programs
    for strategy_name in collaborative:
        cfg.strategy = strategy_name
        for n_neighbors in neighbors:
            cfg.k_neighbors = n_neighbors
            run_strategy(cfg,user)
  296 +
def run_hybrid(user,cfg):
    """Run the experiment for every hybrid strategy.

    Each strategy listed in hybrid is combined with each value of
    neighbors and each value of profile_size; all three are stored in cfg
    before run_strategy is called.
    """
    # Base index names; only needed by the disabled popcon_size variation,
    # which appended "_<size>" to them for every entry of popcon_size.
    popcon_programs_base = cfg.popcon_desktopapps
    popcon_programs_base = cfg.popcon_programs
    for strategy_name in hybrid:
        cfg.strategy = strategy_name
        for n_neighbors in neighbors:
            cfg.k_neighbors = n_neighbors
            for wanted_size in profile_size:
                cfg.profile_size = wanted_size
                run_strategy(cfg,user)
  311 +
if __name__ == '__main__':
    # Alternative users tried during the experiments:
    #user = LocalSystem()
    #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
    #user = PopconSystem("/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7")
    #user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a5834eb2aba6b6f17312239e0761c70")

    cfg = Config()
    submission = "/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623"
    user = PopconSystem(submission)
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    # Strategy families are selected by name on the command line; without
    # any argument all of them are run.
    run_all = len(sys.argv)<2
    if run_all or "content" in sys.argv:
        run_content(user,cfg)
    if run_all or "collaborative" in sys.argv:
        run_collaborative(user,cfg)
    if run_all or "hybrid" in sys.argv:
        run_hybrid(user,cfg)
... ...
src/experiments/sample-popcon.py 0 → 100755
... ... @@ -0,0 +1,53 @@
  1 +#! /usr/bin/env python
  2 +"""
  3 + sample-popcon - extract a sample from popcon population
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import xapian
  23 +import os
  24 +import random
  25 +import sys
  26 +
def extract_sample(size,popcon,min_profile,max_profile,output):
    """Save the data of the first "size" users whose profile fits a range.

    size        -- number of users to collect before stopping
    popcon      -- object offering get_doccount() and get_document(n)
    min_profile -- exclusive lower bound of the profile size
    max_profile -- inclusive upper bound of the profile size
    output      -- path prefix; "-<min_profile>-<max_profile>" is appended

    The profile size of a document is the number of its terms starting
    with "XP". One get_data() value per line is written to the output
    file; progress is printed while scanning.
    """
    # NOTE(review): the name is built without zero padding (sample-50-100)
    # while k-suite reads "sample-050-100" -- confirm which one is wanted.
    target = "%s-%d-%d"%(output,min_profile,max_profile)
    selected = []
    for docid in range(1,popcon.get_doccount()+1):
        document = popcon.get_document(docid)
        profile_len = len([t.term for t in document.termlist()
                           if t.term.startswith("XP")])
        print(profile_len)
        if min_profile<profile_len<=max_profile:
            selected.append(document.get_data())
        print("%d %d" % (docid,len(selected)))
        if len(selected)==size:
            break
    with open(target,'w') as f:
        f.write("".join(entry+'\n' for entry in selected))
  41 +
if __name__ == '__main__':
    # Validate the command line first so that a usage error is reported
    # without needing the popcon index to be present.
    try:
        min_profile = int(sys.argv[1])
        max_profile = int(sys.argv[2])
        size = int(sys.argv[3])
    except (IndexError,ValueError):
        # IndexError: too few arguments; ValueError: not an integer.
        # A bare "except:" would also hide unrelated errors and Ctrl-C.
        print("Usage: sample-popcon min_profile max_profile sample_size")
        sys.exit(1)
    popcon = xapian.Database(os.path.expanduser("~/.app-recommender/popcon_desktopapps"))
    print("Popcon repository size: %d" % popcon.get_doccount())
    sample_file = "results/misc-popcon/sample"
    extract_sample(size,popcon,min_profile,max_profile,sample_file)
... ...