From dc8ededf327ee77c4bf11a528452c1f61c260f2e Mon Sep 17 00:00:00 2001 From: Tássia Camões Araújo Date: Thu, 8 Sep 2011 00:11:48 +0000 Subject: [PATCH] Updated experiments. --- src/experiments/k-suite.py | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/experiments/popcon-population.py | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/experiments/roc-suite.py | 328 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/experiments/sample-popcon.py | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 607 insertions(+), 0 deletions(-) create mode 100755 src/experiments/k-suite.py create mode 100755 src/experiments/popcon-population.py create mode 100755 src/experiments/roc-suite.py create mode 100755 src/experiments/sample-popcon.py diff --git a/src/experiments/k-suite.py b/src/experiments/k-suite.py new file mode 100755 index 0000000..99c4665 --- /dev/null +++ b/src/experiments/k-suite.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python +""" + recommender suite - recommender experiments suite +""" +__author__ = "Tassia Camoes Araujo " +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" +__license__ = """ + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
import sys
import os
sys.path.insert(0,'../')
from config import Config
from data import PopconXapianIndex, PopconSubmission
from recommender import Recommender
from user import LocalSystem, User
from evaluation import *
import logging
import random
import Gnuplot
import numpy


def plot_roc(p,roc_points,log_file):
    """Plot the ROC points collected for one neighbourhood size.

    p          -- number of neighbours (k) the points were measured with;
                  used in the curve title and in the output file names.
    roc_points -- list of [false_positive_rate, true_positive_rate] pairs.
    log_file   -- path prefix; "<log_file>-kNNN.png" and
                  "<log_file>-kNNN.ps" are written.
    """
    g = Gnuplot.Gnuplot()
    g('set style data points')
    g.xlabel('False Positive Rate')
    g.ylabel('True Positive Rate')
    g('set xrange [0:1.0]')
    g('set yrange [0:1.0]')
    g.title("Setup: %s" % log_file.split("/")[-1])
    # The diagonal is the reference line of a random classifier.
    # Use the parameter `p` here: the previous code silently depended on a
    # module-level loop variable named `k`.
    g.plot(Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"),
           Gnuplot.Data(roc_points,title="k %d"%p))
    g.hardcopy(log_file+("-k%.3d.png"%p),terminal="png")
    g.hardcopy(log_file+("-k%.3d.ps"%p),terminal="postscript",enhanced=1,color=1)


class ExperimentResults:
    """Accumulate precision, recall and false positive rate over iterations.

    Each call to add_result() appends one measurement to every list; the
    get_* methods return the arithmetic mean of what was collected so far
    and raise ZeroDivisionError when nothing has been collected.
    """

    def __init__(self,repo_size):
        self.repository_size = repo_size
        self.precision = []
        self.recall = []
        self.fpr = []

    def add_result(self,ranking,sample):
        """Evaluate one recommendation `ranking` against the held-out `sample`."""
        predicted = RecommendationResult(dict.fromkeys(ranking,1))
        real = RecommendationResult(sample)
        evaluation = Evaluation(predicted,real,self.repository_size)
        self.precision.append(evaluation.run(Precision()))
        self.recall.append(evaluation.run(Recall()))
        self.fpr.append(evaluation.run(FPR()))

    def get_roc_point(self):
        """Return [mean FPR, mean TPR], i.e. one averaged ROC point."""
        return [float(sum(self.fpr))/len(self.fpr),
                float(sum(self.recall))/len(self.recall)]

    def get_precision_summary(self):
        """Return the mean precision over all recorded results."""
        return float(sum(self.precision))/len(self.precision)

    def get_recall_summary(self):
        """Return the mean recall over all recorded results."""
        return float(sum(self.recall))/len(self.recall)


if __name__ == '__main__':
    # experiment parameters
    threshold = 20
    iterations = 30
    sample_file = "results/misc-popcon/sample-050-100"
    neighbors = [3,5,10,50,100,150,200,300,400,500]
    cfg = Config()
    cfg.strategy = "knn"
    print(cfg.popcon_index)
    # One popcon submission path per line of the sample file.
    submission_files = []
    with open(sample_file,'r') as f:
        for line in f:
            user_id = line.strip('\n')
            submission_files.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
    # setup dictionaries and files
    roc_points = {}
    recommended = {}
    precisions = {}
    aucs = {}
    log_file = "results/k-suite/sample-050-100/%s" % (cfg.strategy)
    for k in neighbors:
        roc_points[k] = []
        recommended[k] = set()
        precisions[k] = []
        aucs[k] = []
        with open(log_file+"-k%.3d"%k,'w') as f:
            f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))
            f.write("# roc_point \tp(20) \tauc\n\n")
    # Stays None until at least one recommender has been built.
    repo_size = None
    # main loop per user
    for submission_file in submission_files:
        # NOTE(review): PopconSystem is not imported by name in this file; it
        # can only resolve through `from evaluation import *` -- confirm.
        user = PopconSystem(submission_file)
        user.filter_pkg_profile(cfg.pkgs_filter)
        user.maximal_pkg_profile()
        for k in neighbors:
            cfg.k_neighbors = k
            rec = Recommender(cfg)
            repo_size = rec.items_repository.get_doccount()
            results = ExperimentResults(repo_size)
            # n iterations for same recommender and user
            for n in range(iterations):
                # Move a random 90% of the profile into the held-out set;
                # the remaining 10% is what the recommender gets to see.
                profile_size = len(user.pkg_profile)
                item_score = {}
                for pkg in user.pkg_profile:
                    item_score[pkg] = user.item_score[pkg]
                held_out = {}
                for _ in range(int(profile_size*0.9)):
                    key = random.choice(list(item_score))
                    held_out[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,threshold)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,held_out)
                    print("ranking %s" % (recommendation.ranking,))
                    print("recommended_%d %s" % (k,recommended[k]))
                    recommended[k] = recommended[k].union(recommendation.ranking)
                    print(recommended[k])
            if not results.precision:
                # No iteration produced a ranking: there is nothing to average.
                continue
            # save summary
            roc_point = results.get_roc_point()
            # Area under the three-point curve (0,0) -> roc_point -> (1,1).
            auc = numpy.trapz(y=[0,roc_point[1],1],x=[0,roc_point[0],1])
            p_20 = results.get_precision_summary()
            roc_points[k].append(roc_point)
            aucs[k].append(auc)
            precisions[k].append(p_20)
            with open(log_file+"-k%.3d"%k,'a') as f:
                f.write("%s \t%.2f \t%.4f\n" % (str(roc_point),p_20,auc))
    # back to main flow
    with open(log_file,'w') as f:
        f.write("# k coverage \tp(20) \tauc\n\n")
        for k in neighbors:
            if not precisions[k]:
                # Nothing was measured for this k (empty sample or no ranking).
                continue
            print("len_recommended_%d %d" % (k,len(recommended[k])))
            print("repo_size %s" % repo_size)
            coverage = len(recommended[k])/float(repo_size)
            print(coverage)
            f.write("%d \t%.2f \t%.2f \t%.2f\n" %
                    (k,coverage,float(sum(precisions[k]))/len(precisions[k]),
                     float(sum(aucs[k]))/len(aucs[k])))
            plot_roc(k,roc_points[k],log_file)
import os
import random
import sys


def get_population_profile(popcon):
    """Build a histogram of package-profile sizes over a popcon index.

    popcon -- index object providing get_doccount() and get_document(n);
              the profile size of a document is the number of its terms
              that start with "XP".

    Returns (population_profile, max_profile): population_profile is a list
    of (profile_size, number_of_documents) tuples sorted by size, holding
    only sizes that occur; max_profile is the largest size seen. An empty
    index yields ([], 0).
    """
    counts = {}
    # Document ids are taken to run from 1 to get_doccount() inclusive
    # (assumes no gaps in the id sequence), so the upper bound needs the +1;
    # without it the last document was never visited.
    for docid in range(1,popcon.get_doccount()+1):
        user = popcon.get_document(docid)
        size = sum(1 for t in user.termlist() if t.term.startswith("XP"))
        if size<10:
            print("-- profile<10: %s" % user.get_data())
        counts[size] = counts.get(size,0)+1
    if not counts:
        return [],0
    # A single counting pass replaces calling list.count() once per size.
    return sorted(counts.items()),max(counts)


def get_profile_ranges(population_profile,max_profile,popcon_size):
    """Aggregate the size histogram into the ranges (0,50], ..., (200,250]
    and (250,max_profile].

    population_profile -- list of (profile_size, count) tuples.
    max_profile        -- upper bound of the last, open-ended range.
    popcon_size        -- total population, used for the percentages.

    Returns (ranges_population, ranges_percentage), two lists of
    (range_upper_bound, value) tuples. Percentages are 0.0 when
    popcon_size is 0.
    """
    bounds = list(range(0,251,50))
    bounds.append(max_profile)
    ranges_population = []
    ranges_percentage = []
    # Pair every bound with its predecessor directly. Looking the lower bound
    # up with list.index() found the first match, which double-counted a
    # bucket whenever max_profile equalled one of the fixed bounds.
    for minimum,maximum in zip(bounds,bounds[1:]):
        members = sum(count for size,count in population_profile
                      if minimum<size<=maximum)
        share = members/float(popcon_size) if popcon_size else 0.0
        ranges_population.append((maximum,members))
        ranges_percentage.append((maximum,share))
    return ranges_population,ranges_percentage


def plot(data,xlabel,ylabel,output):
    """Plot `data` as points and write "<output>.png" and "<output>.ps"."""
    # Imported here so the analysis helpers above stay usable on machines
    # without the Gnuplot bindings.
    import Gnuplot
    g = Gnuplot.Gnuplot()
    g('set style data points')
    g.xlabel(xlabel)
    g.ylabel(ylabel)
    g.plot(data)
    g.hardcopy(output+".png", terminal="png")
    g.hardcopy(output+".ps", terminal="postscript", enhanced=1, color=1)


if __name__ == '__main__':
    import xapian
    popcon = xapian.Database(os.path.expanduser("~/.app-recommender/popcon_desktopapps"))
    print("Popcon repository size: %d" % popcon.get_doccount())

    profile_population,max_profile = get_population_profile(popcon)
    ranges_population,ranges_percentage = get_profile_ranges(profile_population,
                                          max_profile,popcon.get_doccount())
    print("Population per profile range (up to index)")
    print(ranges_population)
    plot(profile_population,"Desktop profile size","Population size",
         "results/misc-popcon/profile_population")
#!/usr/bin/env python
"""
    recommender suite - recommender experiments suite
"""
__author__ = "Tassia Camoes Araujo "
__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
__license__ = """
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import sys
sys.path.insert(0,'../')
from config import Config
from data import PopconXapianIndex, PopconSubmission
from recommender import Recommender
from user import LocalSystem, User
from evaluation import *
import logging
import random
import Gnuplot
import numpy

# Reduced parameter set for a quick smoke run (swap with the block below):
#iterations = 3
#sample_proportions = [0.9]
#weighting = [('bm25',1.2)]
#collaborative = ['knn_eset']
#content_based = ['cb']
#hybrid = ['knnco']
#profile_size = [50,100]
#popcon_size = ["1000"]
#neighbors = [50]

iterations = 30
sample_proportions = [0.9]
weighting = [('bm25',1.0),('bm25',1.2),('bm25',2.0),('trad',0)]
content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
collaborative = ['knn_eset','knn','knn_plus']
hybrid = ['knnco','knnco_eset']
profile_size = range(20,200,20)
neighbors = range(10,510,50)

# Recommendation size at which the reported precision, "p(20)", is read.
PRECISION_SIZE = 20


def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
    """Write the recall log of iteration `n` to "<log_file>-NN".

    The log holds the experiment label, the repository/profile/sample sizes,
    the sorted rank positions at which held-out packages appear in the
    recommendation and, last, the held-out packages that were not
    recommended at all. Rank data is written only when `recommendation`
    has a `ranking` attribute.
    """
    # `with` guarantees the file is closed even if a write fails.
    with open("%s-%.2d" % (log_file,n),'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%.2d\n" % (label["values"],n))
        output.write("\n# repository profile sample\n%d %d %d\n" %
                     (repo_size,profile_size,len(sample)))
        if not hasattr(recommendation,"ranking"):
            return
        # First position of every package, so each lookup is O(1) instead of
        # a scan of the ranking per held-out package.
        position = {}
        for index,pkg in enumerate(recommendation.ranking):
            position.setdefault(pkg,index)
        ranks = sorted(position[pkg] for pkg in sample if pkg in position)
        notfound = [pkg for pkg in sample if pkg not in position]
        for r in ranks:
            output.write(str(r)+"\n")
        if notfound:
            output.write("# out of recommendation:\n")
            for pkg in notfound:
                output.write(pkg+"\n")


def plot_roc(roc_points,auc,eauc,c,p,log_file):
    """Plot a ROC curve annotated with coverage (c), P(20) (p), AUC and EAUC.

    Writes "<log_file>-roc.png" and "<log_file>-roc.ps".
    """
    g = Gnuplot.Gnuplot()
    g('set style data lines')
    g.xlabel('False Positive Rate')
    g.ylabel('True Positive Rate')
    g('set xrange [0:1.0]')
    g('set yrange [0:1.0]')
    g.title("Setup: %s" % log_file.split("/")[-1])
    g('set label "C %.2f" at 0.8,0.25' % c)
    g('set label "P(20) %.2f" at 0.8,0.2' % p)
    g('set label "AUC %.4f" at 0.8,0.15' % auc)
    g('set label "EAUC %.4f" at 0.8,0.1' % eauc)
    # Curve, random-classifier diagonal, and the segment that extends the
    # last measured point to (1,1).
    g.plot(Gnuplot.Data(roc_points,title="ROC"),
           Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"),
           Gnuplot.Data([roc_points[-1],[1,1]],with_="lines lt 6"))
    g.hardcopy(log_file+"-roc.png",terminal="png")
    g.hardcopy(log_file+"-roc.ps",terminal="postscript",enhanced=1,color=1)


def plot_summary(precision,recall,f1,f05,accuracy,log_file):
    """Plot all metric summaries against recommendation size.

    Writes linear-scale ("<log_file>.png/.ps") and x-log-scale
    ("<log_file>-logscale.png/.ps") versions.
    """
    g = Gnuplot.Gnuplot()
    g('set style data lines')
    g.xlabel('Recommendation size')
    g.title("Setup: %s" % log_file.split("/")[-1])
    g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
           Gnuplot.Data(precision,title="Precision"),
           Gnuplot.Data(recall,title="Recall"),
           Gnuplot.Data(f1,title="F_1"),
           Gnuplot.Data(f05,title="F_0.5"))
    g.hardcopy(log_file+".png",terminal="png")
    g.hardcopy(log_file+".ps",terminal="postscript",enhanced=1,color=1)
    g('set logscale x')
    g('replot')
    g.hardcopy(log_file+"-logscale.png",terminal="png")
    g.hardcopy(log_file+"-logscale.ps",terminal="postscript",enhanced=1,color=1)


def get_label(cfg,sample_proportion):
    """Return {"description", "values"} naming the current configuration.

    The "values" string doubles as the log file name. `sample_proportion`
    is accepted for interface compatibility and is not used.

    Raises ValueError when cfg.strategy belongs to none of the known
    strategy lists; previously an empty dict was returned and the caller
    failed later with a KeyError.
    """
    strategy = cfg.strategy
    known = strategy in content_based or strategy in collaborative or strategy in hybrid
    if not known:
        raise ValueError("Unknown strategy: %r" % (strategy,))
    pkgs_filter = cfg.pkgs_filter.split("/")[-1]
    label = {}
    if strategy in content_based:
        label["description"] = "strategy-filter-profile-k1_bm25"
        label["values"] = ("%s-profile%.3d-%s-kbm%.1f" %
                           (strategy,cfg.profile_size,pkgs_filter,cfg.bm25_k1))
    elif strategy in collaborative:
        label["description"] = "strategy-knn-filter-k1_bm25"
        label["values"] = ("%s-k%.3d-%s-kbm%.1f" %
                           (strategy,cfg.k_neighbors,pkgs_filter,cfg.bm25_k1))
    else:
        label["description"] = "strategy-knn-filter-profile-k1_bm25"
        label["values"] = ("%s-k%.3d-profile%.3d-%s-kbm%.1f" %
                           (strategy,cfg.k_neighbors,cfg.profile_size,
                            pkgs_filter,cfg.bm25_k1))
    return label


class ExperimentResults:
    """Per-recommendation-size metric samples for one experiment setup.

    Every metric is a dict mapping a recommendation size (1, 10, 20, ...
    below the repository size) to the list of values measured at that size.
    add_result() currently fills only precision, recall and fpr; accuracy,
    f1 and f05 stay empty, so their summaries are empty lists.
    """

    def __init__(self,repo_size):
        self.repository_size = repo_size
        self.accuracy = {}
        self.precision = {}
        self.recall = {}
        self.f1 = {}
        self.f05 = {}
        self.fpr = {}
        points = [1]+list(range(10,self.repository_size,10))
        # Union of every package recommended so far (used for coverage).
        self.recommended = set()
        for size in points:
            self.accuracy[size] = []
            self.precision[size] = []
            self.recall[size] = []
            self.f1[size] = []
            self.f05[size] = []
            self.fpr[size] = []

    def add_result(self,ranking,sample):
        """Evaluate every prefix `ranking[:size]` against held-out `sample`."""
        print("len_recommended %d" % len(self.recommended))
        print("len_rank %d" % len(ranking))
        self.recommended = self.recommended.union(ranking)
        print("len_recommended %d" % len(self.recommended))
        for size in self.accuracy:
            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repository_size)
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            self.fpr[size].append(evaluation.run(FPR()))

    def _summary(self,metric):
        """Return sorted [size, mean] pairs, skipping sizes with no samples."""
        return sorted([size,float(sum(values))/len(values)]
                      for size,values in metric.items() if values)

    def _best(self,metric,min_size=None):
        """Return (size, value) of the best single sample in `metric`.

        Sizes not greater than `min_size` and sizes without samples are
        ignored. Raises ValueError when nothing is left to compare.
        """
        candidates = [size for size,values in metric.items()
                      if values and (min_size is None or size>min_size)]
        if not candidates:
            raise ValueError("no results recorded for this metric")
        size = max(candidates,key=lambda s: max(metric[s]))
        return (size,max(metric[size]))

    def get_roc_points(self):
        """Return sorted [mean FPR, mean TPR] points, one per size."""
        points = []
        for size in self.recall:
            tpr = self.recall[size]
            fpr = self.fpr[size]
            if tpr and fpr:
                points.append([float(sum(fpr))/len(fpr),
                               float(sum(tpr))/len(tpr)])
        return sorted(points)

    def get_precision_summary(self):
        return self._summary(self.precision)

    def get_recall_summary(self):
        return self._summary(self.recall)

    def get_f1_summary(self):
        return self._summary(self.f1)

    def get_f05_summary(self):
        return self._summary(self.f05)

    def get_accuracy_summary(self):
        return self._summary(self.accuracy)

    def best_precision(self):
        # Only sizes above 10 are considered. The previous key function
        # (`max(...) and x>10`) compared booleans, not precision values.
        return self._best(self.precision,min_size=10)

    def best_f1(self):
        return self._best(self.f1)

    def best_f05(self):
        return self._best(self.f05)


def run_strategy(cfg,user):
    """Run the ROC experiment for cfg.strategy over every weighting scheme.

    For each weighting and sample proportion, `iterations` random splits of
    the user profile are evaluated; a summary (coverage, p(20), AUC, EAUC)
    is written to "results/<label values>" and a ROC plot next to it.
    """
    for weight in weighting:
        cfg.weight = weight[0]
        cfg.bm25_k1 = weight[1]
        rec = Recommender(cfg)
        repo_size = rec.items_repository.get_doccount()
        for proportion in sample_proportions:
            results = ExperimentResults(repo_size)
            label = get_label(cfg,proportion)
            log_file = "results/"+label["values"]
            for n in range(iterations):
                # Move a random `proportion` of the profile into the held-out
                # sample; the rest is what the recommender gets to see.
                profile_size = len(user.pkg_profile)
                item_score = {}
                for pkg in user.pkg_profile:
                    item_score[pkg] = user.item_score[pkg]
                sample = {}
                for _ in range(int(profile_size*proportion)):
                    key = random.choice(list(item_score))
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,repo_size)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
            roc_points = results.get_roc_points()
            if not roc_points:
                print("No ranking produced for %s, skipping summary" % label["values"])
                continue
            x_coord = [point[0] for point in roc_points]
            y_coord = [point[1] for point in roc_points]
            auc = numpy.trapz(y=y_coord, x=x_coord)
            # Extended AUC: add the areas between (0,0) and the first point
            # and between the last point and (1,1).
            eauc = (auc+
                    numpy.trapz(y=[0,roc_points[0][1]],x=[0,roc_points[0][0]])+
                    numpy.trapz(y=[roc_points[-1][1],1],x=[roc_points[-1][0],1]))
            # The value is reported as p(20), so read it at size 20 (it used
            # to be read at size 10). NaN when that size was not measured.
            at_20 = results.precision.get(PRECISION_SIZE) or []
            precision_20 = float(sum(at_20))/len(at_20) if at_20 else float("nan")
            print(results.recommended)
            print("len %d" % len(results.recommended))
            coverage = len(results.recommended)/float(repo_size)
            print("repo_size:  %s" % float(repo_size))
            print(coverage)
            # A leftover debugging exit(1) used to stop the program here,
            # before anything was written or plotted.
            # F-score and accuracy summaries are not written: add_result()
            # does not collect those metrics at the moment.
            with open(log_file,'w') as f:
                f.write("# %s\n# %s\n\n" %
                        (label["description"],label["values"]))
                f.write("# coverage \tp(20) \tauc \teauc\n\t%.2f \t%.2f \t%.4f \t%.4f\n\n" %
                        (coverage,precision_20,auc,eauc))
            plot_roc(roc_points,auc,eauc,coverage,precision_20,log_file)


def run_content(user,cfg):
    """Run every content-based strategy for every profile size."""
    for strategy in content_based:
        cfg.strategy = strategy
        for size in profile_size:
            cfg.profile_size = size
            run_strategy(cfg,user)


def run_collaborative(user,cfg):
    """Run every collaborative strategy for every neighbourhood size."""
    for strategy in collaborative:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            run_strategy(cfg,user)


def run_hybrid(user,cfg):
    """Run every hybrid strategy for every (neighbours, profile size) pair."""
    for strategy in hybrid:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            for size in profile_size:
                cfg.profile_size = size
                run_strategy(cfg,user)


if __name__ == '__main__':
    #user = LocalSystem()
    #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))

    cfg = Config()
    # NOTE(review): PopconSystem is not imported by name in this file; it can
    # only resolve through `from evaluation import *` -- confirm.
    #user = PopconSystem("/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7")
    user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
    #user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a5834eb2aba6b6f17312239e0761c70")
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    # With no argument every suite runs; otherwise only the ones named.
    run_all = len(sys.argv)<2
    if "content" in sys.argv or run_all:
        run_content(user,cfg)
    if "collaborative" in sys.argv or run_all:
        run_collaborative(user,cfg)
    if "hybrid" in sys.argv or run_all:
        run_hybrid(user,cfg)
#! /usr/bin/env python
"""
    sample-popcon - extract a sample from popcon population
"""
__author__ = "Tassia Camoes Araujo "
__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
__license__ = """
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import os
import random
import sys


def extract_sample(size,popcon,min_profile,max_profile,output):
    """Write up to `size` submissions with a matching profile size to a file.

    size        -- maximum number of submissions to collect; a value of 0
                   or less yields an empty sample.
    popcon      -- index object providing get_doccount() and
                   get_document(n); the profile size of a document is the
                   number of its terms that start with "XP".
    min_profile -- exclusive lower bound of the accepted profile size.
    max_profile -- inclusive upper bound of the accepted profile size.
    output      -- path prefix; the sample is written, one get_data()
                   value per line, to "<output>-<min_profile>-<max_profile>".
    """
    sample = []
    for n in range(1,popcon.get_doccount()+1):
        # Test before fetching the next document so that a full sample, or a
        # non-positive `size`, stops the scan immediately.
        if len(sample)>=size:
            break
        user = popcon.get_document(n)
        profile_len = sum(1 for t in user.termlist() if t.term.startswith("XP"))
        print(profile_len)
        if min_profile<profile_len<=max_profile:
            sample.append(user.get_data())
        print("%d %d" % (n,len(sample)))
    with open("%s-%d-%d" % (output,min_profile,max_profile),'w') as f:
        for entry in sample:
            f.write(entry+'\n')


if __name__ == '__main__':
    import xapian
    popcon = xapian.Database(os.path.expanduser("~/.app-recommender/popcon_desktopapps"))
    print("Popcon repository size: %d" % popcon.get_doccount())
    try:
        min_profile = int(sys.argv[1])
        max_profile = int(sys.argv[2])
        size = int(sys.argv[3])
    except (IndexError,ValueError):
        # Missing or non-numeric argument. Anything else (e.g. Ctrl-C) must
        # not be swallowed here.
        print("Usage: sample-popcon min_profile max_profile sample_size")
        sys.exit(1)
    sample_file = "results/misc-popcon/sample"
    extract_sample(size,popcon,min_profile,max_profile,sample_file)