Updated experiments.

Tássia Camões Araújo
1 parent 78a934e4
Showing 4 changed files with 607 additions and 0 deletions Show diff stats
src/experiments/k-suite.py
src/experiments/popcon-population.py
src/experiments/roc-suite.py
src/experiments/sample-popcon.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python
+"""
+    recommender suite - recommender experiments suite 
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import sys
+sys.path.insert(0,'../')
+from config import Config
+from data import PopconXapianIndex, PopconSubmission
+from recommender import Recommender
+from user import LocalSystem, User
+from evaluation import *
+import logging
+import random
+import Gnuplot
+import numpy
+
+def plot_roc(p,roc_points,log_file):
+    g = Gnuplot.Gnuplot()
+    g('set style data points')
+    g.xlabel('False Positive Rate')
+    g.ylabel('True Positive Rate')
+    g('set xrange [0:1.0]')
+    g('set yrange [0:1.0]')
+    g.title("Setup: %s" % log_file.split("/")[-1])
+    g.plot(Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"),
+           Gnuplot.Data(roc_points,title="k %d"%k))
+    g.hardcopy(log_file+("-k%.3d.png"%k),terminal="png")
+    g.hardcopy(log_file+("-k%.3d.ps"%k),terminal="postscript",enhanced=1,color=1)
+
+class ExperimentResults:
+    def __init__(self,repo_size):
+        self.repository_size = repo_size
+        self.precision = []
+        self.recall = []
+        self.fpr = []
+
+    def add_result(self,ranking,sample):
+        predicted = RecommendationResult(dict.fromkeys(ranking,1))
+        real = RecommendationResult(sample)
+        evaluation = Evaluation(predicted,real,self.repository_size)
+        self.precision.append(evaluation.run(Precision()))
+        self.recall.append(evaluation.run(Recall()))
+        self.fpr.append(evaluation.run(FPR()))
+
+    # Average ROC by threshold (whici is the size)
+    def get_roc_point(self):
+        tpr = self.recall
+        fpr = self.fpr
+        return [sum(fpr)/len(fpr),sum(tpr)/len(tpr)]
+
+    def get_precision_summary(self):
+        return  sum(self.precision)/len(self.precision)
+
+    def get_recall_summary(self):
+        return  sum(self.recall)/len(self.recall)
+
+if __name__ == '__main__':
+    # experiment parameters
+    threshold = 20
+    iterations = 30
+    sample_file = "results/misc-popcon/sample-050-100"
+    neighbors = [3,5,10,50,100,150,200,300,400,500]
+    cfg = Config()
+    cfg.strategy = "knn"
+    print cfg.popcon_index
+    sample = []
+    with open(sample_file,'r') as f:
+        for line in f.readlines():
+            user_id = line.strip('\n')
+            sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
+    # setup dictionaries and files
+    roc_points = {}
+    recommended = {}
+    precisions = {}
+    aucs = {}
+    log_file = "results/k-suite/sample-050-100/%s" % (cfg.strategy)
+    for k in neighbors:
+        roc_points[k] = []
+        recommended[k] = set()
+        precisions[k] = []
+        aucs[k] = []
+        with open(log_file+"-k%.3d"%k,'w') as f:
+            f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))
+            f.write("# roc_point \tp(20) \tauc\n\n") 
+    # main loop per user
+    for submission_file in sample:
+        user = PopconSystem(submission_file)
+        user.filter_pkg_profile(cfg.pkgs_filter)
+        user.maximal_pkg_profile()
+        for k in neighbors:
+            cfg.k_neighbors = k
+            rec = Recommender(cfg)
+            repo_size = rec.items_repository.get_doccount()
+            results = ExperimentResults(repo_size)
+            # n iterations for same recommender and user
+            for n in range(iterations):
+                # Fill sample profile
+                profile_size = len(user.pkg_profile)
+                item_score = {}
+                for pkg in user.pkg_profile:
+                    item_score[pkg] = user.item_score[pkg]
+                sample = {}
+                sample_size = int(profile_size*0.9)
+                for i in range(sample_size):
+                     key = random.choice(item_score.keys())
+                     sample[key] = item_score.pop(key)
+                iteration_user = User(item_score)
+                recommendation = rec.get_recommendation(iteration_user,threshold)
+                if hasattr(recommendation,"ranking"):
+                    results.add_result(recommendation.ranking,sample)
+                    print "ranking",recommendation.ranking
+                    print "recommended_%d"%k,recommended[k]
+                    recommended[k] = recommended[k].union(recommendation.ranking)
+                    print recommended[k]
+            # save summary
+            roc_point = results.get_roc_point()
+            auc = numpy.trapz(y=[0,roc_point[1],1],x=[0,roc_point[0],1])
+            p_20 = results.get_precision_summary()
+            roc_points[k].append(roc_point)
+            aucs[k].append(auc)
+            precisions[k].append(p_20)
+            with open(log_file+"-k%.3d"%k,'a') as f:
+                f.write("%s \t%.2f \t%.4f\n" % (str(roc_point),p_20,auc))
+    # back to main flow
+    with open(log_file,'w') as f:
+        f.write("# k coverage \tp(20) \tauc\n\n")
+        for k in neighbors:
+            print "len_recommended_%d"%k,len(recommended[k])
+            print "repo_size",repo_size
+            coverage = len(recommended[k])/float(repo_size)
+            print coverage
+            f.write("%d \t%.2f \t%.2f \t%.2fi\n" %
+                    (k,coverage,float(sum(precisions[k]))/len(precisions[k]),
+                     float(sum(aucs[k]))/len(aucs[k])))
+            plot_roc(k,roc_points[k],log_file)
@@ -0,0 +1,74 @@
+#! /usr/bin/env python
+"""
+    misc_popcon - misc experiments with popcon data
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import Gnuplot
+import xapian
+import os
+import random
+import sys
+
+def get_population_profile(popcon):
+    profiles_size = []
+    for n in range(1,popcon.get_doccount()):
+        user = popcon.get_document(n)
+        pkgs_profile = [t.term for t in user.termlist() if t.term.startswith("XP")]
+        if len(pkgs_profile)<10:
+            print "-- profile<10:",user.get_data()
+        profiles_size.append(len(pkgs_profile))
+    max_profile = max(profiles_size)
+    population_profile = [(n,profiles_size.count(n))
+                          for n in range(max_profile+1)
+                          if profiles_size.count(n)>0 ]
+    return population_profile,max_profile
+
+def get_profile_ranges(population_profile,max_profile,popcon_size):
+    ranges = range(0,251,50)
+    ranges.append(max_profile)
+    ranges_population = []
+    ranges_percentage = []
+    for maximum in ranges[1:]:
+        minimum = ranges[ranges.index(maximum)-1]
+        valid = [x[1] for x in population_profile
+                 if x[0]>minimum and x[0]<=maximum]
+        ranges_population.append((maximum,sum(valid)))
+        ranges_percentage.append((maximum,sum(valid)/float(popcon_size)))
+    return ranges_population,ranges_percentage
+
+def plot(data,xlabel,ylabel,output):
+    g = Gnuplot.Gnuplot()
+    g('set style data points')
+    g.xlabel(xlabel)
+    g.ylabel(ylabel)
+    g.plot(data)
+    g.hardcopy(output+".png", terminal="png")
+    g.hardcopy(output+".ps", terminal="postscript", enhanced=1, color=1)
+
+if __name__ == '__main__':
+    popcon = xapian.Database(os.path.expanduser("~/.app-recommender/popcon_desktopapps"))
+    print ("Popcon repository size: %d" % popcon.get_doccount())
+
+    profile_population,max_profile =  get_population_profile(popcon)
+    ranges_population,ranges_percentage = get_profile_ranges(profile_population,
+                                                             max_profile,popcon.get_doccount())
+    print "Population per profile range (up to index)"
+    print ranges_population
+    plot(profile_population,"Desktop profile size","Population size",
+         "results/misc-popcon/profile_population")
@@ -0,0 +1,328 @@
+#!/usr/bin/env python
+"""
+    recommender suite - recommender experiments suite 
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import sys
+sys.path.insert(0,'../')
+from config import Config
+from data import PopconXapianIndex, PopconSubmission
+from recommender import Recommender
+from user import LocalSystem, User
+from evaluation import *
+import logging
+import random
+import Gnuplot
+import numpy
+
+# Alternative, much smaller parameter set (currently disabled).
+#iterations = 3
+#sample_proportions = [0.9]
+#weighting = [('bm25',1.2)]
+#collaborative = ['knn_eset']
+#content_based = ['cb']
+#hybrid = ['knnco']
+#profile_size = [50,100]
+#popcon_size = ["1000"]
+#neighbors = [50]
+
+# Experiment parameters read by the functions below.
+# Number of random profile splits evaluated per configuration (run_strategy).
+iterations = 30
+# Fractions of the user profile moved into the sample to be predicted.
+sample_proportions = [0.9]
+# (weight, k1) pairs assigned to cfg.weight and cfg.bm25_k1 in run_strategy.
+weighting = [('bm25',1.0),('bm25',1.2),('bm25',2.0),('trad',0)]
+# Strategy names, grouped by family; get_label picks the label layout
+# according to the group a strategy belongs to.
+content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
+collaborative = ['knn_eset','knn','knn_plus']
+hybrid = ['knnco','knnco_eset']
+# Values assigned to cfg.profile_size (content-based and hybrid runs).
+profile_size = range(20,200,20)
+# Values assigned to cfg.k_neighbors (collaborative and hybrid runs).
+neighbors = range(10,510,50)
+
+def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
+    # Write recall log
+    output = open(("%s-%.2d" % (log_file,n)),'w')
+    output.write("# %s-n\n" % label["description"])
+    output.write("# %s-%.2d\n" % (label["values"],n))
+    output.write("\n# repository profile sample\n%d %d %d\n" % \
+                 (repo_size,profile_size,len(sample)))
+    if hasattr(recommendation,"ranking"):
+        notfound = []
+        ranks = []
+        for pkg in sample.keys():
+            if pkg in recommendation.ranking:
+                ranks.append(recommendation.ranking.index(pkg))
+            else:
+                notfound.append(pkg)
+        for r in sorted(ranks):
+            output.write(str(r)+"\n")
+        if notfound:
+            output.write("# out of recommendation:\n")
+            for pkg in notfound:
+                output.write(pkg+"\n")
+    output.close()
+
+def plot_roc(roc_points,auc,eauc,c,p,log_file):
+    g = Gnuplot.Gnuplot()
+    g('set style data lines')
+    g.xlabel('False Positive Rate')
+    g.ylabel('True Positive Rate')
+    g('set xrange [0:1.0]')
+    g('set yrange [0:1.0]')
+    g.title("Setup: %s" % log_file.split("/")[-1])
+    g('set label "C %.2f" at 0.8,0.25' % c)
+    g('set label "P(20) %.2f" at 0.8,0.2' % p)
+    g('set label "AUC %.4f" at 0.8,0.15' % auc)
+    g('set label "EAUC %.4f" at 0.8,0.1' % eauc)
+    g.plot(Gnuplot.Data(roc_points,title="ROC"),
+           Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"),
+           Gnuplot.Data([roc_points[-1],[1,1]],with_="lines lt 6"))
+    g.hardcopy(log_file+"-roc.png",terminal="png")
+    g.hardcopy(log_file+"-roc.ps",terminal="postscript",enhanced=1,color=1)
+
+def plot_summary(precision,recall,f1,f05,accuracy,log_file):
+    # Plot metrics summary
+    g = Gnuplot.Gnuplot()
+    g('set style data lines')
+    g.xlabel('Recommendation size')
+    g.title("Setup: %s" % log_file.split("/")[-1])
+    g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
+           Gnuplot.Data(precision,title="Precision"),
+           Gnuplot.Data(recall,title="Recall"),
+           Gnuplot.Data(f1,title="F_1"),
+           Gnuplot.Data(f05,title="F_0.5"))
+    g.hardcopy(log_file+".png",terminal="png")
+    g.hardcopy(log_file+".ps",terminal="postscript",enhanced=1,color=1)
+    g('set logscale x')
+    g('replot')
+    g.hardcopy(log_file+"-logscale.png",terminal="png")
+    g.hardcopy(log_file+"-logscale.ps",terminal="postscript",enhanced=1,color=1)
+
+def get_label(cfg,sample_proportion):
+    label = {}
+    if cfg.strategy in content_based:
+        label["description"] = "strategy-filter-profile-k1_bm25"
+        label["values"] = ("%s-profile%.3d-%s-kbm%.1f" %
+                           (cfg.strategy,cfg.profile_size,
+                            cfg.pkgs_filter.split("/")[-1],
+                            cfg.bm25_k1))
+    elif cfg.strategy in collaborative:
+       label["description"] = "strategy-knn-filter-k1_bm25"
+       label["values"] = ("%s-k%.3d-%s-kbm%.1f" %
+                          (cfg.strategy,cfg.k_neighbors,
+                           cfg.pkgs_filter.split("/")[-1],
+                           cfg.bm25_k1))
+    elif cfg.strategy in hybrid:
+       label["description"] = "strategy-knn-filter-profile-k1_bm25"
+       label["values"] = ("%s-k%.3d-profile%.3d-%s-kbm%.1f" %
+                          (cfg.strategy,cfg.k_neighbors,cfg.profile_size,
+                           cfg.pkgs_filter.split("/")[-1],
+                           cfg.bm25_k1))
+    else:
+        print "Unknown strategy"
+    return label
+
+class ExperimentResults:
+    def __init__(self,repo_size):
+        self.repository_size = repo_size
+        self.accuracy = {}
+        self.precision = {}
+        self.recall = {}
+        self.f1 = {}
+        self.f05 = {}
+        self.fpr = {}
+        #points = [1]+range(10,200,10)+range(200,self.repository_size,100)
+        points = [1]+range(10,self.repository_size,10)
+        self.recommended = set()
+        for size in points:
+            self.accuracy[size] = []
+            self.precision[size] = []
+            self.recall[size] = []
+            self.f1[size] = []
+            self.f05[size] = []
+            self.fpr[size] = []
+
+    def add_result(self,ranking,sample):
+        print "len_recommended", len(self.recommended)
+        print "len_rank", len(ranking)
+        self.recommended = self.recommended.union(ranking)
+        print "len_recommended", len(self.recommended)
+        # get data only for point
+        for size in self.accuracy.keys():
+            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
+            real = RecommendationResult(sample)
+            evaluation = Evaluation(predicted,real,self.repository_size)
+            #self.accuracy[size].append(evaluation.run(Accuracy()))
+            self.precision[size].append(evaluation.run(Precision()))
+            self.recall[size].append(evaluation.run(Recall()))
+            #self.f1[size].append(evaluation.run(F_score(1)))
+            #self.f05[size].append(evaluation.run(F_score(0.5)))
+            self.fpr[size].append(evaluation.run(FPR()))
+
+    # Average ROC by threshold (whici is the size)
+    def get_roc_points(self):
+        points = []
+        for size in self.recall.keys():
+            tpr = self.recall[size]
+            fpr = self.fpr[size]
+            points.append([sum(fpr)/len(fpr),sum(tpr)/len(tpr)])
+        return sorted(points)
+
+    def get_precision_summary(self):
+        summary = [[size,sum(values)/len(values)] for size,values in self.precision.items()]
+        return sorted(summary)
+
+    def get_recall_summary(self):
+        summary = [[size,sum(values)/len(values)] for size,values in self.recall.items()]
+        return sorted(summary)
+
+    def get_f1_summary(self):
+        summary = [[size,sum(values)/len(values)] for size,values in self.f1.items()]
+        return sorted(summary)
+
+    def get_f05_summary(self):
+        summary = [[size,sum(values)/len(values)] for size,values in self.f05.items()]
+        return sorted(summary)
+
+    def get_accuracy_summary(self):
+        summary = [[size,sum(values)/len(values)] for size,values in self.accuracy.items()]
+        return sorted(summary)
+
+    def best_precision(self):
+        size = max(self.precision, key = lambda x: max(self.precision[x]) and x>10)
+        return (size,max(self.precision[size]))
+
+    def best_f1(self):
+        size = max(self.f1, key = lambda x: max(self.f1[x]))
+        return (size,max(self.f1[size]))
+
+    def best_f05(self):
+        size = max(self.f05, key = lambda x: max(self.f05[x]))
+        return (size,max(self.f05[size]))
+
+def run_strategy(cfg,user):
+    for weight in weighting:
+        cfg.weight = weight[0]
+        cfg.bm25_k1 = weight[1]
+        rec = Recommender(cfg)
+        repo_size = rec.items_repository.get_doccount()
+        for proportion in sample_proportions:
+            results = ExperimentResults(repo_size)
+            label = get_label(cfg,proportion)
+            #log_file = "results/20110906/4a67a295/"+label["values"]
+            log_file = "results/"+label["values"]
+            for n in range(iterations):
+                # Fill sample profile
+                profile_size = len(user.pkg_profile)
+                item_score = {}
+                for pkg in user.pkg_profile:
+                    item_score[pkg] = user.item_score[pkg]
+                sample = {}
+                sample_size = int(profile_size*proportion)
+                for i in range(sample_size):
+                     key = random.choice(item_score.keys())
+                     sample[key] = item_score.pop(key)
+                iteration_user = User(item_score)
+                recommendation = rec.get_recommendation(iteration_user,repo_size)
+                #write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file)
+                if hasattr(recommendation,"ranking"):
+                    results.add_result(recommendation.ranking,sample)
+            with open(log_file,'w') as f:
+                roc_points = results.get_roc_points()
+                x_coord = [p[0] for p in roc_points]
+                y_coord = [p[1] for p in roc_points]
+                auc = numpy.trapz(y=y_coord, x=x_coord)
+                eauc = (auc+
+                        numpy.trapz(y=[0,roc_points[0][1]],x=[0,roc_points[0][0]])+
+                        numpy.trapz(y=[roc_points[-1][1],1],x=[roc_points[-1][0],1]))
+                precision_20 = sum(results.precision[10])/len(results.precision[10])
+                print results.recommended
+                print "len",len(results.recommended)
+                coverage = len(results.recommended)/float(repo_size)
+                print "repo_size: ", float(repo_size)
+                print coverage
+                exit(1)
+                #f1_10 = sum(results.f1[10])/len(results.f1[10])
+                #f05_10 = sum(results.f05[10])/len(results.f05[10])
+                f.write("# %s\n# %s\n\n" %
+                        (label["description"],label["values"]))
+                f.write("# coverage \tp(20) \tauc \teauc\n\t%.2f \t%.2f \t%.4f \t%.4f\n\n" %
+                        (coverage,precision_20,auc,eauc))
+                #f.write("# best results (recommendation size; metric)\n")
+                #f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\nf05 (%d; %.2f)\n\n" %
+                #        (results.best_precision()[0],results.best_precision()[1],
+                #         results.best_f1()[0],results.best_f1()[1],
+                #         results.best_f05()[0],results.best_f05()[1]))
+                #f.write("# recommendation size 10\nprecision (10; %.2f)\nf1 (10; %.2f)\nf05 (10; %.2f)" %
+                #        (precision_10,f1_10,f05_10))
+            #precision = results.get_precision_summary()
+            #recall = results.get_recall_summary()
+            #f1 = results.get_f1_summary()
+            #f05 = results.get_f05_summary()
+            #accuracy = results.get_accuracy_summary()
+            #plot_summary(precision,recall,f1,f05,accuracy,log_file)
+            plot_roc(roc_points,auc,eauc,coverage,precision_20,log_file)
+
+def run_content(user,cfg):
+    for strategy in content_based:
+        cfg.strategy = strategy
+        for size in profile_size:
+            cfg.profile_size = size
+            run_strategy(cfg,user)
+
+def run_collaborative(user,cfg):
+    popcon_desktopapps = cfg.popcon_desktopapps
+    popcon_programs = cfg.popcon_programs
+    for strategy in collaborative:
+        cfg.strategy = strategy
+        for k in neighbors:
+            cfg.k_neighbors = k
+            #for size in popcon_size:
+            #    if size:
+            #        cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
+            #        cfg.popcon_programs = popcon_programs+"_"+size
+            run_strategy(cfg,user)
+
+def run_hybrid(user,cfg):
+    popcon_desktopapps = cfg.popcon_desktopapps
+    popcon_programs = cfg.popcon_programs
+    for strategy in hybrid:
+        cfg.strategy = strategy
+        for k in neighbors:
+            cfg.k_neighbors = k
+            #for size in popcon_size:
+            #    if size:
+            #        cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
+            #        cfg.popcon_programs = popcon_programs+"_"+size
+            for size in profile_size:
+                cfg.profile_size = size
+                run_strategy(cfg,user)
+
+if __name__ == '__main__':
+    #user = LocalSystem()
+    #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
+
+    cfg = Config()
+    #user = PopconSystem("/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7")
+    user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
+    #user =  PopconSystem("/root/.app-recommender/popcon-entries/4a/4a5834eb2aba6b6f17312239e0761c70")
+    user.filter_pkg_profile(cfg.pkgs_filter)
+    user.maximal_pkg_profile()
+
+    if "content" in sys.argv or len(sys.argv)<2:
+        run_content(user,cfg)
+    if "collaborative" in sys.argv or len(sys.argv)<2:
+        run_collaborative(user,cfg)
+    if "hybrid" in sys.argv or len(sys.argv)<2:
+        run_hybrid(user,cfg)
@@ -0,0 +1,53 @@
+#! /usr/bin/env python
+"""
+    sample-popcon - extract a sample from popcon population
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import xapian
+import os
+import random
+import sys
+
+def extract_sample(size,popcon,min_profile,max_profile,output):
+    sample = []
+    for n in range(1,popcon.get_doccount()+1):
+        user = popcon.get_document(n)
+        pkgs_profile = [t.term for t in user.termlist() if t.term.startswith("XP")]
+        print len(pkgs_profile)
+        if len(pkgs_profile)>min_profile and len(pkgs_profile)<=max_profile:
+            sample.append(user.get_data())
+        print n,len(sample)
+        if len(sample)==size:
+            break
+    with open(("%s-%d-%d"%(output,min_profile,max_profile)),'w') as f:
+        for s in sample:
+            f.write(s+'\n')
+
+if __name__ == '__main__':
+    popcon = xapian.Database(os.path.expanduser("~/.app-recommender/popcon_desktopapps"))
+    print ("Popcon repository size: %d" % popcon.get_doccount())
+    try:
+        min_profile = int(sys.argv[1])
+        max_profile = int(sys.argv[2])
+        size = int(sys.argv[3])
+    except:
+        print "Usage: sample-popcon min_profile max_profile sample_size"
+        exit(1)
+    sample_file = "results/misc-popcon/sample"
+    extract_sample(size,popcon,min_profile,max_profile,sample_file)
@@ -0,0 +1,152 @@		@@ -0,0 +1,152 @@
	1	+#!/usr/bin/env python
	2	+"""
	3	+ recommender suite - recommender experiments suite
	4	+"""
	5	+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
	6	+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
	7	+__license__ = """
	8	+ This program is free software: you can redistribute it and/or modify
	9	+ it under the terms of the GNU General Public License as published by
	10	+ the Free Software Foundation, either version 3 of the License, or
	11	+ (at your option) any later version.
	12	+
	13	+ This program is distributed in the hope that it will be useful,
	14	+ but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	16	+ GNU General Public License for more details.
	17	+
	18	+ You should have received a copy of the GNU General Public License
	19	+ along with this program. If not, see <http://www.gnu.org/licenses/>.
	20	+"""
	21	+
	22	+import sys
	23	+sys.path.insert(0,'../')
	24	+from config import Config
	25	+from data import PopconXapianIndex, PopconSubmission
	26	+from recommender import Recommender
	27	+from user import LocalSystem, User
	28	+from evaluation import *
	29	+import logging
	30	+import random
	31	+import Gnuplot
	32	+import numpy
	33	+
	34	+def plot_roc(p,roc_points,log_file):
	35	+ g = Gnuplot.Gnuplot()
	36	+ g('set style data points')
	37	+ g.xlabel('False Positive Rate')
	38	+ g.ylabel('True Positive Rate')
	39	+ g('set xrange [0:1.0]')
	40	+ g('set yrange [0:1.0]')
	41	+ g.title("Setup: %s" % log_file.split("/")[-1])
	42	+ g.plot(Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"),
	43	+ Gnuplot.Data(roc_points,title="k %d"%k))
	44	+ g.hardcopy(log_file+("-k%.3d.png"%k),terminal="png")
	45	+ g.hardcopy(log_file+("-k%.3d.ps"%k),terminal="postscript",enhanced=1,color=1)
	46	+
	47	+class ExperimentResults:
	48	+ def __init__(self,repo_size):
	49	+ self.repository_size = repo_size
	50	+ self.precision = []
	51	+ self.recall = []
	52	+ self.fpr = []
	53	+
	54	+ def add_result(self,ranking,sample):
	55	+ predicted = RecommendationResult(dict.fromkeys(ranking,1))
	56	+ real = RecommendationResult(sample)
	57	+ evaluation = Evaluation(predicted,real,self.repository_size)
	58	+ self.precision.append(evaluation.run(Precision()))
	59	+ self.recall.append(evaluation.run(Recall()))
	60	+ self.fpr.append(evaluation.run(FPR()))
	61	+
	62	+ # Average ROC by threshold (whici is the size)
	63	+ def get_roc_point(self):
	64	+ tpr = self.recall
	65	+ fpr = self.fpr
	66	+ return [sum(fpr)/len(fpr),sum(tpr)/len(tpr)]
	67	+
	68	+ def get_precision_summary(self):
	69	+ return sum(self.precision)/len(self.precision)
	70	+
	71	+ def get_recall_summary(self):
	72	+ return sum(self.recall)/len(self.recall)
	73	+
	74	+if __name__ == '__main__':
	75	+ # experiment parameters
	76	+ threshold = 20
	77	+ iterations = 30
	78	+ sample_file = "results/misc-popcon/sample-050-100"
	79	+ neighbors = [3,5,10,50,100,150,200,300,400,500]
	80	+ cfg = Config()
	81	+ cfg.strategy = "knn"
	82	+ print cfg.popcon_index
	83	+ sample = []
	84	+ with open(sample_file,'r') as f:
	85	+ for line in f.readlines():
	86	+ user_id = line.strip('\n')
	87	+ sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
	88	+ # setup dictionaries and files
	89	+ roc_points = {}
	90	+ recommended = {}
	91	+ precisions = {}
	92	+ aucs = {}
	93	+ log_file = "results/k-suite/sample-050-100/%s" % (cfg.strategy)
	94	+ for k in neighbors:
	95	+ roc_points[k] = []
	96	+ recommended[k] = set()
	97	+ precisions[k] = []
	98	+ aucs[k] = []
	99	+ with open(log_file+"-k%.3d"%k,'w') as f:
	100	+ f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))
	101	+ f.write("# roc_point \tp(20) \tauc\n\n")
	102	+ # main loop per user
	103	+ for submission_file in sample:
	104	+ user = PopconSystem(submission_file)
	105	+ user.filter_pkg_profile(cfg.pkgs_filter)
	106	+ user.maximal_pkg_profile()
	107	+ for k in neighbors:
	108	+ cfg.k_neighbors = k
	109	+ rec = Recommender(cfg)
	110	+ repo_size = rec.items_repository.get_doccount()
	111	+ results = ExperimentResults(repo_size)
	112	+ # n iterations for same recommender and user
	113	+ for n in range(iterations):
	114	+ # Fill sample profile
	115	+ profile_size = len(user.pkg_profile)
	116	+ item_score = {}
	117	+ for pkg in user.pkg_profile:
	118	+ item_score[pkg] = user.item_score[pkg]
	119	+ sample = {}
	120	+ sample_size = int(profile_size*0.9)
	121	+ for i in range(sample_size):
	122	+ key = random.choice(item_score.keys())
	123	+ sample[key] = item_score.pop(key)
	124	+ iteration_user = User(item_score)
	125	+ recommendation = rec.get_recommendation(iteration_user,threshold)
	126	+ if hasattr(recommendation,"ranking"):
	127	+ results.add_result(recommendation.ranking,sample)
	128	+ print "ranking",recommendation.ranking
	129	+ print "recommended_%d"%k,recommended[k]
	130	+ recommended[k] = recommended[k].union(recommendation.ranking)
	131	+ print recommended[k]
	132	+ # save summary
	133	+ roc_point = results.get_roc_point()
	134	+ auc = numpy.trapz(y=[0,roc_point[1],1],x=[0,roc_point[0],1])
	135	+ p_20 = results.get_precision_summary()
	136	+ roc_points[k].append(roc_point)
	137	+ aucs[k].append(auc)
	138	+ precisions[k].append(p_20)
	139	+ with open(log_file+"-k%.3d"%k,'a') as f:
	140	+ f.write("%s \t%.2f \t%.4f\n" % (str(roc_point),p_20,auc))
	141	+ # back to main flow
	142	+ with open(log_file,'w') as f:
	143	+ f.write("# k coverage \tp(20) \tauc\n\n")
	144	+ for k in neighbors:
	145	+ print "len_recommended_%d"%k,len(recommended[k])
	146	+ print "repo_size",repo_size
	147	+ coverage = len(recommended[k])/float(repo_size)
	148	+ print coverage
	149	+ f.write("%d \t%.2f \t%.2f \t%.2fi\n" %
	150	+ (k,coverage,float(sum(precisions[k]))/len(precisions[k]),
	151	+ float(sum(aucs[k]))/len(aucs[k])))
	152	+ plot_roc(k,roc_points[k],log_file)
@@ -0,0 +1,74 @@		@@ -0,0 +1,74 @@
	1	+#! /usr/bin/env python
	2	+"""
	3	+ misc_popcon - misc experiments with popcon data
	4	+"""
	5	+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
	6	+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
	7	+__license__ = """
	8	+ This program is free software: you can redistribute it and/or modify
	9	+ it under the terms of the GNU General Public License as published by
	10	+ the Free Software Foundation, either version 3 of the License, or
	11	+ (at your option) any later version.
	12	+
	13	+ This program is distributed in the hope that it will be useful,
	14	+ but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	16	+ GNU General Public License for more details.
	17	+
	18	+ You should have received a copy of the GNU General Public License
	19	+ along with this program. If not, see <http://www.gnu.org/licenses/>.
	20	+"""
	21	+
	22	+import Gnuplot
	23	+import xapian
	24	+import os
	25	+import random
	26	+import sys
	27	+
	28	+def get_population_profile(popcon):
	29	+ profiles_size = []
	30	+ for n in range(1,popcon.get_doccount()):
	31	+ user = popcon.get_document(n)
	32	+ pkgs_profile = [t.term for t in user.termlist() if t.term.startswith("XP")]
	33	+ if len(pkgs_profile)<10:
	34	+ print "-- profile<10:",user.get_data()
	35	+ profiles_size.append(len(pkgs_profile))
	36	+ max_profile = max(profiles_size)
	37	+ population_profile = [(n,profiles_size.count(n))
	38	+ for n in range(max_profile+1)
	39	+ if profiles_size.count(n)>0 ]
	40	+ return population_profile,max_profile
	41	+
	42	+def get_profile_ranges(population_profile,max_profile,popcon_size):
	43	+ ranges = range(0,251,50)
	44	+ ranges.append(max_profile)
	45	+ ranges_population = []
	46	+ ranges_percentage = []
	47	+ for maximum in ranges[1:]:
	48	+ minimum = ranges[ranges.index(maximum)-1]
	49	+ valid = [x[1] for x in population_profile
	50	+ if x[0]>minimum and x[0]<=maximum]
	51	+ ranges_population.append((maximum,sum(valid)))
	52	+ ranges_percentage.append((maximum,sum(valid)/float(popcon_size)))
	53	+ return ranges_population,ranges_percentage
	54	+
	55	+def plot(data,xlabel,ylabel,output):
	56	+ g = Gnuplot.Gnuplot()
	57	+ g('set style data points')
	58	+ g.xlabel(xlabel)
	59	+ g.ylabel(ylabel)
	60	+ g.plot(data)
	61	+ g.hardcopy(output+".png", terminal="png")
	62	+ g.hardcopy(output+".ps", terminal="postscript", enhanced=1, color=1)
	63	+
	64	+if __name__ == '__main__':
	65	+ popcon = xapian.Database(os.path.expanduser("~/.app-recommender/popcon_desktopapps"))
	66	+ print ("Popcon repository size: %d" % popcon.get_doccount())
	67	+
	68	+ profile_population,max_profile = get_population_profile(popcon)
	69	+ ranges_population,ranges_percentage = get_profile_ranges(profile_population,
	70	+ max_profile,popcon.get_doccount())
	71	+ print "Population per profile range (up to index)"
	72	+ print ranges_population
	73	+ plot(profile_population,"Desktop profile size","Population size",
	74	+ "results/misc-popcon/profile_population")
@@ -0,0 +1,328 @@		@@ -0,0 +1,328 @@
	1	+#!/usr/bin/env python
	2	+"""
	3	+ recommender suite - recommender experiments suite
	4	+"""
	5	+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
	6	+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
	7	+__license__ = """
	8	+ This program is free software: you can redistribute it and/or modify
	9	+ it under the terms of the GNU General Public License as published by
	10	+ the Free Software Foundation, either version 3 of the License, or
	11	+ (at your option) any later version.
	12	+
	13	+ This program is distributed in the hope that it will be useful,
	14	+ but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	16	+ GNU General Public License for more details.
	17	+
	18	+ You should have received a copy of the GNU General Public License
	19	+ along with this program. If not, see <http://www.gnu.org/licenses/>.
	20	+"""
	21	+
	22	+import sys
	23	+sys.path.insert(0,'../')
	24	+from config import Config
	25	+from data import PopconXapianIndex, PopconSubmission
	26	+from recommender import Recommender
	27	+from user import LocalSystem, User
	28	+from evaluation import *
	29	+import logging
	30	+import random
	31	+import Gnuplot
	32	+import numpy
	33	+
	34	+#iterations = 3
	35	+#sample_proportions = [0.9]
	36	+#weighting = [('bm25',1.2)]
	37	+#collaborative = ['knn_eset']
	38	+#content_based = ['cb']
	39	+#hybrid = ['knnco']
	40	+#profile_size = [50,100]
	41	+#popcon_size = ["1000"]
	42	+#neighbors = [50]
	43	+
	44	+iterations = 30
	45	+sample_proportions = [0.9]
	46	+weighting = [('bm25',1.0),('bm25',1.2),('bm25',2.0),('trad',0)]
	47	+content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
	48	+collaborative = ['knn_eset','knn','knn_plus']
	49	+hybrid = ['knnco','knnco_eset']
	50	+profile_size = range(20,200,20)
	51	+neighbors = range(10,510,50)
	52	+
	53	+def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
	54	+ # Write recall log
	55	+ output = open(("%s-%.2d" % (log_file,n)),'w')
	56	+ output.write("# %s-n\n" % label["description"])
	57	+ output.write("# %s-%.2d\n" % (label["values"],n))
	58	+ output.write("\n# repository profile sample\n%d %d %d\n" % \
	59	+ (repo_size,profile_size,len(sample)))
	60	+ if hasattr(recommendation,"ranking"):
	61	+ notfound = []
	62	+ ranks = []
	63	+ for pkg in sample.keys():
	64	+ if pkg in recommendation.ranking:
	65	+ ranks.append(recommendation.ranking.index(pkg))
	66	+ else:
	67	+ notfound.append(pkg)
	68	+ for r in sorted(ranks):
	69	+ output.write(str(r)+"\n")
	70	+ if notfound:
	71	+ output.write("# out of recommendation:\n")
	72	+ for pkg in notfound:
	73	+ output.write(pkg+"\n")
	74	+ output.close()
	75	+
	76	+def plot_roc(roc_points,auc,eauc,c,p,log_file):
	77	+ g = Gnuplot.Gnuplot()
	78	+ g('set style data lines')
	79	+ g.xlabel('False Positive Rate')
	80	+ g.ylabel('True Positive Rate')
	81	+ g('set xrange [0:1.0]')
	82	+ g('set yrange [0:1.0]')
	83	+ g.title("Setup: %s" % log_file.split("/")[-1])
	84	+ g('set label "C %.2f" at 0.8,0.25' % c)
	85	+ g('set label "P(20) %.2f" at 0.8,0.2' % p)
	86	+ g('set label "AUC %.4f" at 0.8,0.15' % auc)
	87	+ g('set label "EAUC %.4f" at 0.8,0.1' % eauc)
	88	+ g.plot(Gnuplot.Data(roc_points,title="ROC"),
	89	+ Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"),
	90	+ Gnuplot.Data([roc_points[-1],[1,1]],with_="lines lt 6"))
	91	+ g.hardcopy(log_file+"-roc.png",terminal="png")
	92	+ g.hardcopy(log_file+"-roc.ps",terminal="postscript",enhanced=1,color=1)
	93	+
	94	+def plot_summary(precision,recall,f1,f05,accuracy,log_file):
	95	+ # Plot metrics summary
	96	+ g = Gnuplot.Gnuplot()
	97	+ g('set style data lines')
	98	+ g.xlabel('Recommendation size')
	99	+ g.title("Setup: %s" % log_file.split("/")[-1])
	100	+ g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
	101	+ Gnuplot.Data(precision,title="Precision"),
	102	+ Gnuplot.Data(recall,title="Recall"),
	103	+ Gnuplot.Data(f1,title="F_1"),
	104	+ Gnuplot.Data(f05,title="F_0.5"))
	105	+ g.hardcopy(log_file+".png",terminal="png")
	106	+ g.hardcopy(log_file+".ps",terminal="postscript",enhanced=1,color=1)
	107	+ g('set logscale x')
	108	+ g('replot')
	109	+ g.hardcopy(log_file+"-logscale.png",terminal="png")
	110	+ g.hardcopy(log_file+"-logscale.ps",terminal="postscript",enhanced=1,color=1)
	111	+
	112	+def get_label(cfg,sample_proportion):
	113	+ label = {}
	114	+ if cfg.strategy in content_based:
	115	+ label["description"] = "strategy-filter-profile-k1_bm25"
	116	+ label["values"] = ("%s-profile%.3d-%s-kbm%.1f" %
	117	+ (cfg.strategy,cfg.profile_size,
	118	+ cfg.pkgs_filter.split("/")[-1],
	119	+ cfg.bm25_k1))
	120	+ elif cfg.strategy in collaborative:
	121	+ label["description"] = "strategy-knn-filter-k1_bm25"
	122	+ label["values"] = ("%s-k%.3d-%s-kbm%.1f" %
	123	+ (cfg.strategy,cfg.k_neighbors,
	124	+ cfg.pkgs_filter.split("/")[-1],
	125	+ cfg.bm25_k1))
	126	+ elif cfg.strategy in hybrid:
	127	+ label["description"] = "strategy-knn-filter-profile-k1_bm25"
	128	+ label["values"] = ("%s-k%.3d-profile%.3d-%s-kbm%.1f" %
	129	+ (cfg.strategy,cfg.k_neighbors,cfg.profile_size,
	130	+ cfg.pkgs_filter.split("/")[-1],
	131	+ cfg.bm25_k1))
	132	+ else:
	133	+ print "Unknown strategy"
	134	+ return label
	135	+
	136	+class ExperimentResults:
	137	+ def __init__(self,repo_size):
	138	+ self.repository_size = repo_size
	139	+ self.accuracy = {}
	140	+ self.precision = {}
	141	+ self.recall = {}
	142	+ self.f1 = {}
	143	+ self.f05 = {}
	144	+ self.fpr = {}
	145	+ #points = [1]+range(10,200,10)+range(200,self.repository_size,100)
	146	+ points = [1]+range(10,self.repository_size,10)
	147	+ self.recommended = set()
	148	+ for size in points:
	149	+ self.accuracy[size] = []
	150	+ self.precision[size] = []
	151	+ self.recall[size] = []
	152	+ self.f1[size] = []
	153	+ self.f05[size] = []
	154	+ self.fpr[size] = []
	155	+
	156	+ def add_result(self,ranking,sample):
	157	+ print "len_recommended", len(self.recommended)
	158	+ print "len_rank", len(ranking)
	159	+ self.recommended = self.recommended.union(ranking)
	160	+ print "len_recommended", len(self.recommended)
	161	+ # get data only for point
	162	+ for size in self.accuracy.keys():
	163	+ predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
	164	+ real = RecommendationResult(sample)
	165	+ evaluation = Evaluation(predicted,real,self.repository_size)
	166	+ #self.accuracy[size].append(evaluation.run(Accuracy()))
	167	+ self.precision[size].append(evaluation.run(Precision()))
	168	+ self.recall[size].append(evaluation.run(Recall()))
	169	+ #self.f1[size].append(evaluation.run(F_score(1)))
	170	+ #self.f05[size].append(evaluation.run(F_score(0.5)))
	171	+ self.fpr[size].append(evaluation.run(FPR()))
	172	+
	173	+ # Average ROC by threshold (whici is the size)
	174	+ def get_roc_points(self):
	175	+ points = []
	176	+ for size in self.recall.keys():
	177	+ tpr = self.recall[size]
	178	+ fpr = self.fpr[size]
	179	+ points.append([sum(fpr)/len(fpr),sum(tpr)/len(tpr)])
	180	+ return sorted(points)
	181	+
	182	+ def get_precision_summary(self):
	183	+ summary = [[size,sum(values)/len(values)] for size,values in self.precision.items()]
	184	+ return sorted(summary)
	185	+
	186	+ def get_recall_summary(self):
	187	+ summary = [[size,sum(values)/len(values)] for size,values in self.recall.items()]
	188	+ return sorted(summary)
	189	+
	190	+ def get_f1_summary(self):
	191	+ summary = [[size,sum(values)/len(values)] for size,values in self.f1.items()]
	192	+ return sorted(summary)
	193	+
	194	+ def get_f05_summary(self):
	195	+ summary = [[size,sum(values)/len(values)] for size,values in self.f05.items()]
	196	+ return sorted(summary)
	197	+
	198	+ def get_accuracy_summary(self):
	199	+ summary = [[size,sum(values)/len(values)] for size,values in self.accuracy.items()]
	200	+ return sorted(summary)
	201	+
	202	+ def best_precision(self):
	203	+ size = max(self.precision, key = lambda x: max(self.precision[x]) and x>10)
	204	+ return (size,max(self.precision[size]))
	205	+
	206	+ def best_f1(self):
	207	+ size = max(self.f1, key = lambda x: max(self.f1[x]))
	208	+ return (size,max(self.f1[size]))
	209	+
	210	+ def best_f05(self):
	211	+ size = max(self.f05, key = lambda x: max(self.f05[x]))
	212	+ return (size,max(self.f05[size]))
	213	+
	214	+def run_strategy(cfg,user):
	215	+ for weight in weighting:
	216	+ cfg.weight = weight[0]
	217	+ cfg.bm25_k1 = weight[1]
	218	+ rec = Recommender(cfg)
	219	+ repo_size = rec.items_repository.get_doccount()
	220	+ for proportion in sample_proportions:
	221	+ results = ExperimentResults(repo_size)
	222	+ label = get_label(cfg,proportion)
	223	+ #log_file = "results/20110906/4a67a295/"+label["values"]
	224	+ log_file = "results/"+label["values"]
	225	+ for n in range(iterations):
	226	+ # Fill sample profile
	227	+ profile_size = len(user.pkg_profile)
	228	+ item_score = {}
	229	+ for pkg in user.pkg_profile:
	230	+ item_score[pkg] = user.item_score[pkg]
	231	+ sample = {}
	232	+ sample_size = int(profile_size*proportion)
	233	+ for i in range(sample_size):
	234	+ key = random.choice(item_score.keys())
	235	+ sample[key] = item_score.pop(key)
	236	+ iteration_user = User(item_score)
	237	+ recommendation = rec.get_recommendation(iteration_user,repo_size)
	238	+ #write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file)
	239	+ if hasattr(recommendation,"ranking"):
	240	+ results.add_result(recommendation.ranking,sample)
	241	+ with open(log_file,'w') as f:
	242	+ roc_points = results.get_roc_points()
	243	+ x_coord = [p[0] for p in roc_points]
	244	+ y_coord = [p[1] for p in roc_points]
	245	+ auc = numpy.trapz(y=y_coord, x=x_coord)
	246	+ eauc = (auc+
	247	+ numpy.trapz(y=[0,roc_points[0][1]],x=[0,roc_points[0][0]])+
	248	+ numpy.trapz(y=[roc_points[-1][1],1],x=[roc_points[-1][0],1]))
	249	+ precision_20 = sum(results.precision[10])/len(results.precision[10])
	250	+ print results.recommended
	251	+ print "len",len(results.recommended)
	252	+ coverage = len(results.recommended)/float(repo_size)
	253	+ print "repo_size: ", float(repo_size)
	254	+ print coverage
	255	+ exit(1)
	256	+ #f1_10 = sum(results.f1[10])/len(results.f1[10])
	257	+ #f05_10 = sum(results.f05[10])/len(results.f05[10])
	258	+ f.write("# %s\n# %s\n\n" %
	259	+ (label["description"],label["values"]))
	260	+ f.write("# coverage \tp(20) \tauc \teauc\n\t%.2f \t%.2f \t%.4f \t%.4f\n\n" %
	261	+ (coverage,precision_20,auc,eauc))
	262	+ #f.write("# best results (recommendation size; metric)\n")
	263	+ #f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\nf05 (%d; %.2f)\n\n" %
	264	+ # (results.best_precision()[0],results.best_precision()[1],
	265	+ # results.best_f1()[0],results.best_f1()[1],
	266	+ # results.best_f05()[0],results.best_f05()[1]))
	267	+ #f.write("# recommendation size 10\nprecision (10; %.2f)\nf1 (10; %.2f)\nf05 (10; %.2f)" %
	268	+ # (precision_10,f1_10,f05_10))
	269	+ #precision = results.get_precision_summary()
	270	+ #recall = results.get_recall_summary()
	271	+ #f1 = results.get_f1_summary()
	272	+ #f05 = results.get_f05_summary()
	273	+ #accuracy = results.get_accuracy_summary()
	274	+ #plot_summary(precision,recall,f1,f05,accuracy,log_file)
	275	+ plot_roc(roc_points,auc,eauc,coverage,precision_20,log_file)
	276	+
	277	+def run_content(user,cfg):
	278	+ for strategy in content_based:
	279	+ cfg.strategy = strategy
	280	+ for size in profile_size:
	281	+ cfg.profile_size = size
	282	+ run_strategy(cfg,user)
	283	+
	284	+def run_collaborative(user,cfg):
	285	+ popcon_desktopapps = cfg.popcon_desktopapps
	286	+ popcon_programs = cfg.popcon_programs
	287	+ for strategy in collaborative:
	288	+ cfg.strategy = strategy
	289	+ for k in neighbors:
	290	+ cfg.k_neighbors = k
	291	+ #for size in popcon_size:
	292	+ # if size:
	293	+ # cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
	294	+ # cfg.popcon_programs = popcon_programs+"_"+size
	295	+ run_strategy(cfg,user)
	296	+
	297	+def run_hybrid(user,cfg):
	298	+ popcon_desktopapps = cfg.popcon_desktopapps
	299	+ popcon_programs = cfg.popcon_programs
	300	+ for strategy in hybrid:
	301	+ cfg.strategy = strategy
	302	+ for k in neighbors:
	303	+ cfg.k_neighbors = k
	304	+ #for size in popcon_size:
	305	+ # if size:
	306	+ # cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
	307	+ # cfg.popcon_programs = popcon_programs+"_"+size
	308	+ for size in profile_size:
	309	+ cfg.profile_size = size
	310	+ run_strategy(cfg,user)
	311	+
	312	+if __name__ == '__main__':
	313	+ #user = LocalSystem()
	314	+ #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
	315	+
	316	+ cfg = Config()
	317	+ #user = PopconSystem("/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7")
	318	+ user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
	319	+ #user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a5834eb2aba6b6f17312239e0761c70")
	320	+ user.filter_pkg_profile(cfg.pkgs_filter)
	321	+ user.maximal_pkg_profile()
	322	+
	323	+ if "content" in sys.argv or len(sys.argv)<2:
	324	+ run_content(user,cfg)
	325	+ if "collaborative" in sys.argv or len(sys.argv)<2:
	326	+ run_collaborative(user,cfg)
	327	+ if "hybrid" in sys.argv or len(sys.argv)<2:
	328	+ run_hybrid(user,cfg)
@@ -0,0 +1,53 @@		@@ -0,0 +1,53 @@
	1	+#! /usr/bin/env python
	2	+"""
	3	+ sample-popcon - extract a sample from popcon population
	4	+"""
	5	+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
	6	+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
	7	+__license__ = """
	8	+ This program is free software: you can redistribute it and/or modify
	9	+ it under the terms of the GNU General Public License as published by
	10	+ the Free Software Foundation, either version 3 of the License, or
	11	+ (at your option) any later version.
	12	+
	13	+ This program is distributed in the hope that it will be useful,
	14	+ but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	16	+ GNU General Public License for more details.
	17	+
	18	+ You should have received a copy of the GNU General Public License
	19	+ along with this program. If not, see <http://www.gnu.org/licenses/>.
	20	+"""
	21	+
	22	+import xapian
	23	+import os
	24	+import random
	25	+import sys
	26	+
	27	+def extract_sample(size,popcon,min_profile,max_profile,output):
	28	+ sample = []
	29	+ for n in range(1,popcon.get_doccount()+1):
	30	+ user = popcon.get_document(n)
	31	+ pkgs_profile = [t.term for t in user.termlist() if t.term.startswith("XP")]
	32	+ print len(pkgs_profile)
	33	+ if len(pkgs_profile)>min_profile and len(pkgs_profile)<=max_profile:
	34	+ sample.append(user.get_data())
	35	+ print n,len(sample)
	36	+ if len(sample)==size:
	37	+ break
	38	+ with open(("%s-%d-%d"%(output,min_profile,max_profile)),'w') as f:
	39	+ for s in sample:
	40	+ f.write(s+'\n')
	41	+
	42	+if __name__ == '__main__':
	43	+ popcon = xapian.Database(os.path.expanduser("~/.app-recommender/popcon_desktopapps"))
	44	+ print ("Popcon repository size: %d" % popcon.get_doccount())
	45	+ try:
	46	+ min_profile = int(sys.argv[1])
	47	+ max_profile = int(sys.argv[2])
	48	+ size = int(sys.argv[3])
	49	+ except:
	50	+ print "Usage: sample-popcon min_profile max_profile sample_size"
	51	+ exit(1)
	52	+ sample_file = "results/misc-popcon/sample"
	53	+ extract_sample(size,popcon,min_profile,max_profile,sample_file)