Renamed files.

Tássia Camões Araújo
1 parent e2be2c33
Showing 3 changed files with 0 additions and 691 deletions Show diff stats
src/experiments/k-suite.py
src/experiments/roc-suite.py
src/experiments/strategies-suite.py
@@ -1,186 +0,0 @@
-#!/usr/bin/env python
-"""
-    k-suite - experiment different neighborhood sizes
-"""
-__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
-__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
-__license__ = """
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-"""
-
-import sys
-sys.path.insert(0,'../')
-from config import Config
-from data import PopconXapianIndex, PopconSubmission
-from recommender import Recommender
-from user import LocalSystem, User
-from evaluation import *
-import logging
-import random
-import Gnuplot
-import numpy
-
-def plot_roc(k,roc_points,log_file):
-    """Plot the ROC points gathered for neighborhood size k.
-
-    The points are drawn together with the (0,0)-(1,1) diagonal and the
-    chart is saved as "<log_file>-kNNN.png" and "<log_file>-kNNN.ps".
-    """
-    setup_name = log_file.split("/")[-1]
-    output_stem = log_file+("-k%.3d"%k)
-    chart = Gnuplot.Gnuplot()
-    chart('set style data points')
-    chart.xlabel('False Positive Rate')
-    chart.ylabel('True Positive Rate')
-    # both rates are proportions, so the axes are fixed to [0,1]
-    for axis_range in ('set xrange [0:1.0]','set yrange [0:1.0]'):
-        chart(axis_range)
-    chart.title("Setup: %s-k%d" % (setup_name,k))
-    chart.plot(Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"),
-               Gnuplot.Data(roc_points))
-    chart.hardcopy(output_stem+".png",terminal="png")
-    chart.hardcopy(output_stem+".ps",terminal="postscript",enhanced=1,color=1)
-
-def _average_by_key(values_by_key):
-    """Return [[key,mean],...] in increasing key order, skipping empty lists."""
-    return [[key,float(sum(values_by_key[key]))/len(values_by_key[key])]
-            for key in sorted(values_by_key) if values_by_key[key]]
-
-def plot_summary(precision,f05,mcc,log_file):
-    """Plot mean precision, F0.5 and MCC against the neighborhood size k.
-
-    precision, f05 and mcc are dictionaries mapping each k to the list of
-    values collected for it.  The chart is saved as "<log_file>.png" and
-    "<log_file>.ps".
-
-    The series are drawn with lines, so the points are emitted in
-    increasing k; iterating the dictionaries in their own key order would
-    make the curves jump back and forth.  Keys without any collected value
-    are left out instead of dividing by zero.
-    """
-    g = Gnuplot.Gnuplot()
-    g('set style data lines')
-    g.xlabel('Neighborhood (k)')
-    g.title("Setup: %s-size20" % (log_file.split("/")[-1]))
-    g.plot(Gnuplot.Data(_average_by_key(precision),title="P"),
-           Gnuplot.Data(_average_by_key(f05),title="F05"),
-           Gnuplot.Data(_average_by_key(mcc),title="MCC"))
-    g.hardcopy(log_file+(".png"),terminal="png")
-    g.hardcopy(log_file+(".ps"),terminal="postscript",enhanced=1,color=1)
-
-class ExperimentResults:
-    """Metric values collected over the iterations of one experiment.
-
-    Every add_result() call appends one value to each metric list; the
-    get_*() methods return the arithmetic mean of what was collected.
-    """
-    def __init__(self,repo_size):
-        self.repository_size = repo_size
-        for metric_name in ("precision","recall","fpr","f05","mcc"):
-            setattr(self,metric_name,[])
-
-    def add_result(self,ranking,sample):
-        """Evaluate a ranking against the held-out sample and record it."""
-        evaluation = Evaluation(RecommendationResult(dict.fromkeys(ranking,1)),
-                                RecommendationResult(sample),
-                                self.repository_size)
-        metrics = ((self.precision,Precision,()),
-                   (self.recall,Recall,()),
-                   (self.fpr,FPR,()),
-                   (self.f05,F_score,(0.5,)),
-                   (self.mcc,MCC,()))
-        for collected,metric_class,args in metrics:
-            collected.append(evaluation.run(metric_class(*args)))
-
-    def get_roc_point(self):
-        """Return [mean fpr, mean recall], or [0,0] when either is empty."""
-        if self.recall and self.fpr:
-            return [sum(self.fpr)/len(self.fpr),
-                    sum(self.recall)/len(self.recall)]
-        return [0,0]
-
-    def get_precision_summary(self):
-        """Return the mean precision, or 0 when nothing was recorded."""
-        values = self.precision
-        return sum(values)/len(values) if values else 0
-
-    def get_f05_summary(self):
-        """Return the mean F0.5 score, or 0 when nothing was recorded."""
-        values = self.f05
-        return sum(values)/len(values) if values else 0
-
-    def get_mcc_summary(self):
-        """Return the mean MCC, or 0 when nothing was recorded."""
-        values = self.mcc
-        return sum(values)/len(values) if values else 0
-
-if __name__ == '__main__':
-    # NOTE(review): neither "os" nor "PopconSystem" is imported explicitly at
-    # the top of this script; presumably both arrive through
-    # "from evaluation import *" - confirm.  "os" is imported here so the
-    # path handling below does not depend on that.
-    import os
-    if len(sys.argv)<3:
-        print "Usage: k-suite strategy_str sample_file"
-        sys.exit(1)
-    threshold = 20     # size of every requested recommendation
-    iterations = 30    # sampling rounds per user and neighborhood size
-    neighbors = [3,5,10,50,100,150,200,300,400,500]
-    cfg = Config()
-    cfg.strategy = sys.argv[1]
-    sample_file = sys.argv[2]
-    sample_name = sample_file.split('/')[-1]
-    population_sample = []
-    with open(sample_file,'r') as f:
-        for line in f:
-            user_id = line.strip('\n')
-            # a blank line would otherwise become a bogus submission path
-            if not user_id.strip():
-                continue
-            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
-    if not population_sample:
-        # without users there is no repository size and every mean below
-        # would divide by zero
-        print "No submissions listed in %s" % sample_file
-        sys.exit(1)
-    # setup dictionaries and files
-    roc_summary = {}
-    recommended = {}
-    precision_summary = {}
-    f05_summary = {}
-    mcc_summary = {}
-    sample_dir = ("results/k-suite/%s" % sample_name)
-    if not os.path.exists(sample_dir):
-        os.makedirs(sample_dir)
-    log_file = os.path.join(sample_dir,cfg.strategy)
-    with open(log_file,'w') as f:
-        f.write("# %s\n\n" % sample_name)
-        f.write("# strategy %s recommendation_size %d iterations %d\n\n" %
-                (cfg.strategy,threshold,iterations))
-        f.write("# k coverage \tprecision \tf05 \tmcc\n\n")
-
-    for k in neighbors:
-        roc_summary[k] = []
-        recommended[k] = set()
-        precision_summary[k] = []
-        f05_summary[k] = []
-        mcc_summary[k] = []
-        with open(log_file+"-k%.3d"%k,'w') as f:
-            f.write("# %s\n\n" % sample_name)
-            f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))
-            f.write("# roc_point \tprecision \tf05 \tmcc\n\n")
-
-    # main loop per user
-    for submission_file in population_sample:
-        user = PopconSystem(submission_file)
-        user.filter_pkg_profile(cfg.pkgs_filter)
-        user.maximal_pkg_profile()
-        for k in neighbors:
-            cfg.k_neighbors = k
-            rec = Recommender(cfg)
-            repo_size = rec.items_repository.get_doccount()
-            results = ExperimentResults(repo_size)
-            # n iterations for same recommender and user
-            for n in range(iterations):
-                # Move a random 90% of the profile into the sample the
-                # recommender has to predict; the rest describes the user
-                profile_len = len(user.pkg_profile)
-                item_score = dict((pkg,user.item_score[pkg])
-                                  for pkg in user.pkg_profile)
-                sample = {}
-                sample_size = int(profile_len*0.9)
-                for key in random.sample(list(item_score),sample_size):
-                    sample[key] = item_score.pop(key)
-                iteration_user = User(item_score)
-                recommendation = rec.get_recommendation(iteration_user,threshold)
-                if hasattr(recommendation,"ranking"):
-                    results.add_result(recommendation.ranking,sample)
-                    recommended[k].update(recommendation.ranking)
-            # save summary
-            roc_point = results.get_roc_point()
-            roc_summary[k].append(roc_point)
-            precision = results.get_precision_summary()
-            precision_summary[k].append(precision)
-            f05 = results.get_f05_summary()
-            f05_summary[k].append(f05)
-            mcc = results.get_mcc_summary()
-            mcc_summary[k].append(mcc)
-            with open(log_file+"-k%.3d"%k,'a') as f:
-                f.write("[%.2f,%.2f] \t%.4f \t%.4f \t%.4f\n" %
-                        (roc_point[0],roc_point[1],precision,f05,mcc))
-    # back to main flow
-    plot_summary(precision_summary,f05_summary,mcc_summary,log_file)
-    with open(log_file,'a') as f:
-        for k in neighbors:
-            # coverage: share of the repository recommended at least once
-            # for this k (the set is keyed by k, like every other summary)
-            coverage = len(recommended[k])/float(repo_size)
-            f.write("%3d \t%.2f \t%.4f \t%.4f \t%.4f\n" %
-                    (k,coverage,float(sum(precision_summary[k]))/len(precision_summary[k]),
-                     float(sum(f05_summary[k]))/len(f05_summary[k]),
-                     float(sum(mcc_summary[k]))/len(mcc_summary[k])))
-            plot_roc(k,roc_summary[k],log_file)
@@ -1,231 +0,0 @@
-#!/usr/bin/env python
-"""
-    recommender suite - recommender experiments suite 
-"""
-__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
-__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
-__license__ = """
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-"""
-
-import sys
-sys.path.insert(0,'../')
-from config import Config
-from data import PopconXapianIndex, PopconSubmission
-from recommender import Recommender
-from user import LocalSystem, User
-from evaluation import *
-import logging
-import random
-import Gnuplot
-import numpy
-
-# Reduced settings, kept commented out as an alternative to the full values
-# assigned below.
-#iterations = 3
-#sample_proportions = [0.9]
-#weighting = [('bm25',1.2)]
-#collaborative = ['knn_eset']
-#content_based = ['cb']
-#hybrid = ['knnco']
-#profile_size = [50,100]
-#popcon_size = ["1000"]
-#neighbors = [50]
-
-# Sampling rounds run_strategy() performs for every configuration.
-iterations = 30
-# Fractions of the user profile moved into the held-out sample.
-sample_proportions = [0.9]
-# (weight, bm25_k1) pairs that run_strategy() copies into cfg.
-weighting = [('bm25',1.0)]
-# Strategy names by family; run_content(), run_collaborative() and
-# run_hybrid() iterate over them and get_label() uses them to pick a label.
-content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
-collaborative = ['knn_eset','knn','knn_plus']
-hybrid = ['knnco','knnco_eset']
-# Values assigned to cfg.profile_size (20, 60, 100, 140, 180).
-profile_size = range(20,200,40)
-# Values assigned to cfg.k_neighbors (10, 60, ..., 460).
-neighbors = range(10,510,50)
-
-def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
-    """Write the recall log of iteration n to "<log_file>-NN".
-
-    The file starts with the label description and values, followed by the
-    repository size, the profile size and the sample size.  When the
-    recommendation has a "ranking" attribute, the ranking position of
-    every sample package found in it is listed in increasing order,
-    followed by the sample packages that are absent from the ranking.
-    """
-    # "with" closes the file even if a write fails
-    with open(("%s-%.2d" % (log_file,n)),'w') as output:
-        output.write("# %s-n\n" % label["description"])
-        output.write("# %s-%.2d\n" % (label["values"],n))
-        output.write("\n# repository profile sample\n%d %d %d\n" % \
-                     (repo_size,profile_size,len(sample)))
-        if not hasattr(recommendation,"ranking"):
-            return
-        # first position of each package: one pass over the ranking instead
-        # of a list scan per sample package
-        position = {}
-        for rank,pkg in enumerate(recommendation.ranking):
-            position.setdefault(pkg,rank)
-        ranks = sorted(position[pkg] for pkg in sample if pkg in position)
-        notfound = [pkg for pkg in sample if pkg not in position]
-        for r in ranks:
-            output.write(str(r)+"\n")
-        if notfound:
-            output.write("# out of recommendation:\n")
-            for pkg in notfound:
-                output.write(pkg+"\n")
-
-def plot_roc(roc_points,eauc,c,p,log_file):
-    """Plot a ROC curve annotated with the values c, p and eauc.
-
-    The curve is drawn with the (0,0)-(1,1) diagonal and saved as
-    "<log_file>-roc.png" and "<log_file>-roc.ps".  c, p and eauc are
-    printed on the chart under the labels "C", "P(20)" and "AUC".
-    """
-    chart = Gnuplot.Gnuplot()
-    chart('set style data lines')
-    chart.xlabel('False Positive Rate')
-    chart.ylabel('True Positive Rate')
-    chart('set xrange [0:1.0]')
-    chart('set yrange [0:1.0]')
-    chart.title("Setup: %s" % log_file.split("/")[-1])
-    annotations = (('set label "C %.2f" at 0.8,0.25',c),
-                   ('set label "P(20) %.2f" at 0.8,0.2',p),
-                   ('set label "AUC %.4f" at 0.8,0.15',eauc))
-    for template,value in annotations:
-        chart(template % value)
-    curve = Gnuplot.Data(roc_points,title="ROC")
-    diagonal = Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7")
-    # disabled third series: Gnuplot.Data([roc_points[-1],[1,1]],with_="lines lt 6")
-    chart.plot(curve,diagonal)
-    output_stem = log_file+"-roc"
-    chart.hardcopy(output_stem+".png",terminal="png")
-    chart.hardcopy(output_stem+".ps",terminal="postscript",enhanced=1,color=1)
-
-def get_label(cfg,sample_proportion):
-    """Build the label dictionary for the current configuration.
-
-    Returns a dictionary with the keys "description" and "values", chosen
-    by the family (content_based, collaborative or hybrid) cfg.strategy
-    belongs to.  For a strategy in none of them "Unknown strategy" is
-    printed and an empty dictionary is returned.  sample_proportion is
-    accepted but not used here.
-    """
-    strategy = cfg.strategy
-    if strategy in content_based:
-        return {"description": "strategy-profile",
-                "values": ("%s-profile%.3d" %
-                           (strategy,cfg.profile_size))}
-    if strategy in collaborative:
-        return {"description": "strategy-knn",
-                "values": ("%s-k%.3d" %
-                           (strategy,cfg.k_neighbors))}
-    if strategy in hybrid:
-        return {"description": "strategy-knn-profile",
-                "values": ("%s-k%.3d-profile%.3d" %
-                           (strategy,cfg.k_neighbors,cfg.profile_size))}
-    print "Unknown strategy"
-    return {}
-
-class ExperimentResults:
-    """Precision, recall and FPR per recommendation size.
-
-    Values are kept for the sizes 1, 10, 20, ... below the repository
-    size; every add_result() call appends one value per size and metric.
-    """
-    def __init__(self,repo_size):
-        self.repository_size = repo_size
-        self.recommended = set()
-        self.precision = {}
-        self.recall = {}
-        self.fpr = {}
-        for size in [1]+list(range(10,self.repository_size,10)):
-            for metric in (self.precision,self.recall,self.fpr):
-                metric[size] = []
-
-    def add_result(self,ranking,sample):
-        """Record the metrics of every prefix ranking[:size] of a ranking."""
-        self.recommended = self.recommended | set(ranking)
-        # get data only for point
-        for size in self.precision:
-            evaluation = Evaluation(
-                RecommendationResult(dict.fromkeys(ranking[:size],1)),
-                RecommendationResult(sample),
-                self.repository_size)
-            self.precision[size].append(evaluation.run(Precision()))
-            self.recall[size].append(evaluation.run(Recall()))
-            self.fpr[size].append(evaluation.run(FPR()))
-
-    # Average ROC by threshold (= size of recommendation)
-    def get_roc_points(self):
-        """Return the sorted [mean fpr, mean recall] points, one per size."""
-        return sorted([sum(self.fpr[size])/len(self.fpr[size]),
-                       sum(self.recall[size])/len(self.recall[size])]
-                      for size in self.recall)
-
-def _split_profile(user,proportion):
-    """Move a random proportion of the user's profile into a sample.
-
-    Returns (remaining,sample), two dictionaries mapping package to score.
-    """
-    remaining = dict((pkg,user.item_score[pkg]) for pkg in user.pkg_profile)
-    sample_size = int(len(user.pkg_profile)*proportion)
-    sample = {}
-    for pkg in random.sample(list(remaining),sample_size):
-        sample[pkg] = remaining.pop(pkg)
-    return remaining,sample
-
-def run_strategy(cfg,user):
-    """Run the current cfg strategy for every weighting and sample proportion.
-
-    For each combination the user's profile is split "iterations" times
-    into a held-out sample and a reduced profile, a recommendation as long
-    as the repository is requested for the reduced profile, and a recall
-    log is written per iteration.  Afterwards coverage, p(20), AUC and
-    extended AUC are written to results/roc-suite/<user id>/<label values>
-    and the ROC curve is plotted next to it.
-    """
-    # "os" is not imported explicitly at the top of this script
-    import os
-    for weight in weighting:
-        cfg.weight = weight[0]
-        cfg.bm25_k1 = weight[1]
-        rec = Recommender(cfg)
-        repo_size = rec.items_repository.get_doccount()
-        for proportion in sample_proportions:
-            results = ExperimentResults(repo_size)
-            label = get_label(cfg,proportion)
-            user_dir = ("results/roc-suite/%s" % user.user_id[:8])
-            if not os.path.exists(user_dir):
-                # makedirs also creates results/roc-suite on a first run
-                os.makedirs(user_dir)
-            log_file = os.path.join(user_dir,label["values"])
-            profile_len = len(user.pkg_profile)
-            for n in range(iterations):
-                item_score,sample = _split_profile(user,proportion)
-                iteration_user = User(item_score)
-                recommendation = rec.get_recommendation(iteration_user,repo_size)
-                write_recall_log(label,n,sample,recommendation,profile_len,repo_size,log_file)
-                if hasattr(recommendation,"ranking"):
-                    results.add_result(recommendation.ranking,sample)
-            if not results.precision.get(20):
-                # no iteration produced a ranking: every mean below would
-                # divide by zero
-                print "No ranking for %s, summary skipped" % label["values"]
-                continue
-            roc_points = results.get_roc_points()
-            x_coord = [p[0] for p in roc_points]
-            y_coord = [p[1] for p in roc_points]
-            auc = numpy.trapz(y=y_coord, x=x_coord)
-            # extended AUC: close the curve from (0,0) to its first point
-            # and from its last point to (1,1)
-            eauc = (auc+
-                    numpy.trapz(y=[0,roc_points[0][1]],x=[0,roc_points[0][0]])+
-                    numpy.trapz(y=[roc_points[-1][1],1],x=[roc_points[-1][0],1]))
-            # reported as p(20), so it is read at recommendation size 20
-            precision_20 = sum(results.precision[20])/len(results.precision[20])
-            coverage = len(results.recommended)/float(repo_size)
-            with open(log_file,'w') as f:
-                f.write("# %s\n# %s\n\n" %
-                        (label["description"],label["values"]))
-                f.write("# coverage \tp(20) \tauc \teauc\n\t%.2f \t%.2f \t%.4f \t%.4f\n\n" %
-                        (coverage,precision_20,auc,eauc))
-            plot_roc(roc_points,eauc,coverage,precision_20,log_file)
-
-def run_content(user,cfg):
-    """Run every content-based strategy with every profile size."""
-    for strategy_name in content_based:
-        cfg.strategy = strategy_name
-        for n_terms in profile_size:
-            cfg.profile_size = n_terms
-            run_strategy(cfg,user)
-
-def run_collaborative(user,cfg):
-    """Run every collaborative strategy with every neighborhood size."""
-    for strategy in collaborative:
-        cfg.strategy = strategy
-        for k in neighbors:
-            cfg.k_neighbors = k
-            run_strategy(cfg,user)
-
-def run_hybrid(user,cfg):
-    """Run every hybrid strategy with every neighborhood and profile size."""
-    for strategy in hybrid:
-        cfg.strategy = strategy
-        for k in neighbors:
-            cfg.k_neighbors = k
-            for size in profile_size:
-                cfg.profile_size = size
-                run_strategy(cfg,user)
-
-if __name__ == '__main__':
-    usage = "Usage: roc-suite popcon_submission_path [content|collaborative|hybrid]"
-    if len(sys.argv)<2:
-        print usage
-        sys.exit(1)
-
-    # Strategy families in execution order.  Only the arguments after the
-    # submission path select families, so a path that happens to contain
-    # one of the keywords cannot be mistaken for a selection.
-    runners = [("content",run_content),
-               ("collaborative",run_collaborative),
-               ("hybrid",run_hybrid)]
-    requested = sys.argv[2:]
-    selected = [runner for mode,runner in runners
-                if not requested or mode in requested]
-    if not selected:
-        # arguments were given but none names a known family
-        print usage
-        sys.exit(1)
-
-    # NOTE(review): PopconSystem is not imported explicitly in this script;
-    # presumably it arrives through "from evaluation import *" - confirm.
-    cfg = Config()
-    user = PopconSystem(sys.argv[1])
-    user.filter_pkg_profile(cfg.pkgs_filter)
-    user.maximal_pkg_profile()
-
-    for runner in selected:
-        runner(user,cfg)
@@ -1,274 +0,0 @@
-#!/usr/bin/env python
-"""
-    recommender suite - recommender experiments suite 
-"""
-__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
-__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
-__license__ = """
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-"""
-
-import sys
-sys.path.insert(0,'../')
-from config import Config
-from data import PopconXapianIndex, PopconSubmission, AppAptXapianIndex
-from recommender import Recommender
-from user import LocalSystem, User
-from evaluation import *
-import logging
-import random
-import Gnuplot
-
-# Reduced settings, kept commented out as an alternative to the full values
-# assigned below.
-#iterations = 3
-#sample_proportions = [0.9]
-#weighting = [('bm25',1.2)]
-#collaborative = ['knn']
-#content_based = []
-#hybrid = ['knnco']
-#profile_size = [50,100]
-#popcon_size = ["1000"]
-#neighbors = [50]
-
-# Sampling rounds run_strategy() performs for every configuration.
-iterations = 10
-# Fractions of the user profile moved into the held-out sample.
-sample_proportions = [0.5, 0.6, 0.7, 0.8, 0.9]
-# (weight, bm25_k1) pairs that run_strategy() copies into cfg.
-weighting = [('bm25',1.2), ('bm25',1.6), ('bm25',2.0), ('trad',0)]
-# Strategy names by family; run_content(), run_collaborative() and
-# run_hybrid() iterate over them and get_label() uses them to pick a label.
-content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
-collaborative = ['knn_eset','knn','knn_plus']
-hybrid = ['knnco','knnco_eset']
-
-# Values assigned to cfg.profile_size (20, 40, 60, 80).
-profile_size = range(20,100,20)
-#popcon_size = [1000,10000,50000,'full']
-# Values assigned to cfg.k_neighbors (10, 60, ..., 460).
-neighbors = range(10,510,50)
-
-def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
-    """Write the recall log of iteration n to "<log_file>-<n>".
-
-    The file starts with the label description and values, followed by the
-    repository size, the profile size and the sample size.  When the
-    recommendation has a "ranking" attribute, the ranking position of
-    every sample package found in it is listed in increasing order,
-    followed by the sample packages that are absent from the ranking.
-    """
-    # "with" closes the file even if a write fails
-    with open(("%s-%d" % (log_file,n)),'w') as output:
-        output.write("# %s-n\n" % label["description"])
-        output.write("# %s-%d\n" % (label["values"],n))
-        output.write("\n%d %d %d\n" % \
-                     (repo_size,profile_size,len(sample)))
-        if not hasattr(recommendation,"ranking"):
-            return
-        # first position of each package: one pass over the ranking instead
-        # of a list scan per sample package
-        position = {}
-        for rank,pkg in enumerate(recommendation.ranking):
-            position.setdefault(pkg,rank)
-        ranks = sorted(position[pkg] for pkg in sample if pkg in position)
-        notfound = [pkg for pkg in sample if pkg not in position]
-        for r in ranks:
-            output.write(str(r)+"\n")
-        if notfound:
-            output.write("Out of recommendation:\n")
-            for pkg in notfound:
-                output.write(pkg+"\n")
-
-def plot_summary(precision,recall,f1,f05,accuracy,log_file):
-    """Plot the five metric summaries against the recommendation size.
-
-    The chart is saved as "<log_file>.png" and "<log_file>.ps", then
-    redrawn with a logarithmic x axis and saved again as
-    "<log_file>-logscale.png" and "<log_file>-logscale.ps".
-    """
-    chart = Gnuplot.Gnuplot()
-
-    def save(stem):
-        chart.hardcopy(stem+".png",terminal="png")
-        chart.hardcopy(stem+".ps",terminal="postscript",enhanced=1,color=1)
-
-    chart('set style data lines')
-    chart.xlabel('Recommendation size')
-    chart.title("Setup: %s" % log_file.split("/")[-1])
-    series = [Gnuplot.Data(accuracy,title="Accuracy"),
-              Gnuplot.Data(precision,title="Precision"),
-              Gnuplot.Data(recall,title="Recall"),
-              Gnuplot.Data(f1,title="F_1"),
-              Gnuplot.Data(f05,title="F_0.5")]
-    chart.plot(*series)
-    save(log_file)
-    chart('set logscale x')
-    chart('replot')
-    save(log_file+"-logscale")
-
-def get_label(cfg,sample_proportion):
-    """Build the label dictionary for the current configuration.
-
-    Returns a dictionary with the keys "description" and "values", chosen
-    by the family (content_based, collaborative or hybrid) cfg.strategy
-    belongs to.  The values string ends with the last path component of
-    cfg.pkgs_filter, cfg.bm25_k1 and sample_proportion.  For a strategy in
-    none of the families "Unknown strategy" is printed and an empty
-    dictionary is returned.
-    """
-    strategy = cfg.strategy
-    if not (strategy in content_based or strategy in collaborative
-            or strategy in hybrid):
-        print "Unknown strategy"
-        return {}
-    # fields shared by the three families
-    tail = (cfg.pkgs_filter.split("/")[-1],cfg.bm25_k1,sample_proportion)
-    if strategy in content_based:
-        description = "strategy-filter-profile-k1_bm25-sample"
-        values = ("%s-profile%d-%s-kbm%.1f-sample%.1f" %
-                  ((strategy,cfg.profile_size)+tail))
-    elif strategy in collaborative:
-        description = "strategy-knn-filter-k1_bm25-sample"
-        values = ("%s-k%d-%s-kbm%.1f-sample%.1f" %
-                  ((strategy,cfg.k_neighbors)+tail))
-    else:
-        description = "strategy-knn-filter-profile-k1_bm25-sample"
-        values = ("%s-k%d-profile%d-%s-kbm%.1f-sample%.1f" %
-                  ((strategy,cfg.k_neighbors,cfg.profile_size)+tail))
-    return {"description": description, "values": values}
-
-class ExperimentResults:
-    """Accuracy, precision, recall, F1 and F0.5 per recommendation size.
-
-    Values are kept for the sizes 1, 10, 20, ..., 190 and then every 100
-    from 200 up to the repository size; every add_result() call appends
-    one value per size and metric.
-    """
-    def __init__(self,repo_size):
-        self.repository_size = repo_size
-        self.accuracy = {}
-        self.precision = {}
-        self.recall = {}
-        self.f1 = {}
-        self.f05 = {}
-        sizes = ([1]+list(range(10,200,10))+
-                 list(range(200,self.repository_size,100)))
-        for metric in (self.accuracy,self.precision,self.recall,
-                       self.f1,self.f05):
-            for size in sizes:
-                metric[size] = []
-
-    def add_result(self,ranking,sample):
-        """Record the metrics of every prefix ranking[:size] of a ranking."""
-        for size in self.accuracy:
-            evaluation = Evaluation(
-                RecommendationResult(dict.fromkeys(ranking[:size],1)),
-                RecommendationResult(sample),
-                self.repository_size)
-            self.accuracy[size].append(evaluation.run(Accuracy()))
-            self.precision[size].append(evaluation.run(Precision()))
-            self.recall[size].append(evaluation.run(Recall()))
-            self.f1[size].append(evaluation.run(F_score(1)))
-            self.f05[size].append(evaluation.run(F_score(0.5)))
-
-    def _averages(self,metric):
-        # sorted [size, mean value] pairs of one metric dictionary
-        return sorted([size,sum(values)/len(values)]
-                      for size,values in metric.items())
-
-    def _best(self,metric):
-        # (size, value) of the highest single value in one metric dictionary
-        size = max(metric, key = lambda x: max(metric[x]))
-        return (size,max(metric[size]))
-
-    def get_precision_summary(self):
-        """Return the sorted [size, mean precision] pairs."""
-        return self._averages(self.precision)
-
-    def get_recall_summary(self):
-        """Return the sorted [size, mean recall] pairs."""
-        return self._averages(self.recall)
-
-    def get_f1_summary(self):
-        """Return the sorted [size, mean F1] pairs."""
-        return self._averages(self.f1)
-
-    def get_f05_summary(self):
-        """Return the sorted [size, mean F0.5] pairs."""
-        return self._averages(self.f05)
-
-    def get_accuracy_summary(self):
-        """Return the sorted [size, mean accuracy] pairs."""
-        return self._averages(self.accuracy)
-
-    def best_precision(self):
-        """Return (size, value) of the highest recorded precision."""
-        return self._best(self.precision)
-
-    def best_f1(self):
-        """Return (size, value) of the highest recorded F1."""
-        return self._best(self.f1)
-
-    def best_f05(self):
-        """Return (size, value) of the highest recorded F0.5."""
-        return self._best(self.f05)
-
-def _split_profile(user,proportion):
-    """Move a random proportion of the user's profile into a sample.
-
-    Returns (remaining,sample), two dictionaries mapping package to score.
-    """
-    remaining = dict((pkg,user.item_score[pkg]) for pkg in user.pkg_profile)
-    sample_size = int(len(user.pkg_profile)*proportion)
-    sample = {}
-    for pkg in random.sample(list(remaining),sample_size):
-        sample[pkg] = remaining.pop(pkg)
-    return remaining,sample
-
-def run_strategy(cfg,user):
-    """Run the current cfg strategy for every weighting and sample proportion.
-
-    For each combination the user's profile is split "iterations" times
-    into a held-out sample and a reduced profile, a recommendation as long
-    as the repository is requested for the reduced profile, and a recall
-    log is written per iteration.  Afterwards the best and the size-10
-    precision, F1 and F0.5 are written to results/strategies/<label values>
-    and the metric summaries are plotted next to it.
-    """
-    # "os" is not imported explicitly at the top of this script
-    import os
-    log_dir = "results/strategies"
-    if not os.path.exists(log_dir):
-        os.makedirs(log_dir)
-    for weight in weighting:
-        cfg.weight = weight[0]
-        cfg.bm25_k1 = weight[1]
-        rec = Recommender(cfg)
-        repo_size = rec.items_repository.get_doccount()
-        for proportion in sample_proportions:
-            results = ExperimentResults(repo_size)
-            label = get_label(cfg,proportion)
-            log_file = "results/strategies/"+label["values"]
-            # named profile_len so the module-level profile_size list is
-            # not shadowed
-            profile_len = len(user.pkg_profile)
-            for n in range(iterations):
-                item_score,sample = _split_profile(user,proportion)
-                iteration_user = User(item_score)
-                recommendation = rec.get_recommendation(iteration_user,repo_size)
-                write_recall_log(label,n,sample,recommendation,profile_len,repo_size,log_file)
-                if hasattr(recommendation,"ranking"):
-                    results.add_result(recommendation.ranking,sample)
-            if not results.precision[10]:
-                # no iteration produced a ranking: every mean and maximum
-                # below would fail on the empty lists
-                print "No ranking for %s, summary skipped" % label["values"]
-                continue
-            precision_10 = sum(results.precision[10])/len(results.precision[10])
-            f1_10 = sum(results.f1[10])/len(results.f1[10])
-            f05_10 = sum(results.f05[10])/len(results.f05[10])
-            best_precision = results.best_precision()
-            best_f1 = results.best_f1()
-            best_f05 = results.best_f05()
-            with open(log_file,'w') as f:
-                # the coverage figure is the size of the last recommendation
-                f.write("# %s\n# %s\n\ncoverage %d\n\n" %
-                        (label["description"],label["values"],recommendation.size))
-                f.write("# best results (recommendation size; metric)\n")
-                f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\nf05 (%d; %.2f)\n\n" %
-                        (best_precision[0],best_precision[1],
-                         best_f1[0],best_f1[1],
-                         best_f05[0],best_f05[1]))
-                f.write("# recommendation size 10\nprecision (10; %.2f)\nf1 (10; %.2f)\nf05 (10; %.2f)" %
-                        (precision_10,f1_10,f05_10))
-            precision = results.get_precision_summary()
-            recall = results.get_recall_summary()
-            f1 = results.get_f1_summary()
-            f05 = results.get_f05_summary()
-            accuracy = results.get_accuracy_summary()
-            plot_summary(precision,recall,f1,f05,accuracy,log_file)
-
-def run_content(user,cfg):
-    """Run every content-based strategy with every profile size."""
-    for strategy_name in content_based:
-        cfg.strategy = strategy_name
-        for n_terms in profile_size:
-            cfg.profile_size = n_terms
-            run_strategy(cfg,user)
-
-def run_collaborative(user,cfg):
-    """Run every collaborative strategy with every neighborhood size.
-
-    A sweep over popcon index sizes (suffixing cfg.popcon_desktopapps and
-    cfg.popcon_programs with each size) used to be sketched here in
-    commented-out code; it is not performed.
-    """
-    for strategy in collaborative:
-        cfg.strategy = strategy
-        for k in neighbors:
-            cfg.k_neighbors = k
-            run_strategy(cfg,user)
-
-def run_hybrid(user,cfg):
-    """Run every hybrid strategy with every neighborhood and profile size.
-
-    A sweep over popcon index sizes (suffixing cfg.popcon_desktopapps and
-    cfg.popcon_programs with each size) used to be sketched here in
-    commented-out code; it is not performed.
-    """
-    for strategy in hybrid:
-        cfg.strategy = strategy
-        for k in neighbors:
-            cfg.k_neighbors = k
-            for size in profile_size:
-                cfg.profile_size = size
-                run_strategy(cfg,user)
-
-if __name__ == '__main__':
-    # Alternative users that were tried and left disabled:
-    #user = LocalSystem()
-    #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
-    #user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
-
-    cfg = Config()
-    user = PopconSystem("/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7")
-    user.filter_pkg_profile(cfg.pkgs_filter)
-    user.maximal_pkg_profile()
-
-    # without any argument every strategy family is run
-    run_everything = len(sys.argv)<2
-    families = (("content",run_content),
-                ("collaborative",run_collaborative),
-                ("hybrid",run_hybrid))
-    for keyword,runner in families:
-        if keyword in sys.argv or run_everything:
-            runner(user,cfg)
	@@ -1,186 +0,0 @@	@@ -1,186 +0,0 @@
1	-#!/usr/bin/env python
2	-"""
3	- k-suite - experiment different neighborhood sizes
4	-"""
5	-__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
6	-__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
7	-__license__ = """
8	- This program is free software: you can redistribute it and/or modify
9	- it under the terms of the GNU General Public License as published by
10	- the Free Software Foundation, either version 3 of the License, or
11	- (at your option) any later version.
12	-
13	- This program is distributed in the hope that it will be useful,
14	- but WITHOUT ANY WARRANTY; without even the implied warranty of
15	- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16	- GNU General Public License for more details.
17	-
18	- You should have received a copy of the GNU General Public License
19	- along with this program. If not, see <http://www.gnu.org/licenses/>.
20	-"""
21	-
22	-import sys
23	-sys.path.insert(0,'../')
24	-from config import Config
25	-from data import PopconXapianIndex, PopconSubmission
26	-from recommender import Recommender
27	-from user import LocalSystem, User
28	-from evaluation import *
29	-import logging
30	-import random
31	-import Gnuplot
32	-import numpy
33	-
34	-def plot_roc(k,roc_points,log_file):
35	- g = Gnuplot.Gnuplot()
36	- g('set style data points')
37	- g.xlabel('False Positive Rate')
38	- g.ylabel('True Positive Rate')
39	- g('set xrange [0:1.0]')
40	- g('set yrange [0:1.0]')
41	- g.title("Setup: %s-k%d" % (log_file.split("/")[-1],k))
42	- g.plot(Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"),
43	- Gnuplot.Data(roc_points))
44	- g.hardcopy(log_file+("-k%.3d.png"%k),terminal="png")
45	- g.hardcopy(log_file+("-k%.3d.ps"%k),terminal="postscript",enhanced=1,color=1)
46	-
47	-def plot_summary(precision,f05,mcc,log_file):
48	- g = Gnuplot.Gnuplot()
49	- g('set style data lines')
50	- g.xlabel('Neighborhood (k)')
51	- g.title("Setup: %s-size20" % (log_file.split("/")[-1]))
52	- g.plot(Gnuplot.Data([[k,sum(precision[k])/len(precision[k])] for k in precision.keys()],title="P"),
53	- Gnuplot.Data([[k,sum(f05[k])/len(f05[k])] for k in f05.keys()],title="F05"),
54	- Gnuplot.Data([[k,sum(mcc[k])/len(mcc[k])] for k in mcc.keys()],title="MCC"))
55	- g.hardcopy(log_file+(".png"),terminal="png")
56	- g.hardcopy(log_file+(".ps"),terminal="postscript",enhanced=1,color=1)
57	-
class ExperimentResults:
    """
    Accumulates the evaluation metrics of successive recommendations for
    one setup and reduces them to their averages.
    """

    def __init__(self, repo_size):
        self.repository_size = repo_size
        # Each list gets one entry per add_result() call.
        self.precision, self.recall, self.fpr = [], [], []
        self.f05, self.mcc = [], []

    def add_result(self, ranking, sample):
        """Evaluate ranking against the expected sample and store the metrics."""
        expected = RecommendationResult(sample)
        suggested = RecommendationResult(dict.fromkeys(ranking, 1))
        run = Evaluation(suggested, expected, self.repository_size).run
        self.precision.append(run(Precision()))
        self.recall.append(run(Recall()))
        self.fpr.append(run(FPR()))
        self.f05.append(run(F_score(0.5)))
        self.mcc.append(run(MCC()))

    def get_roc_point(self):
        """Return [mean FPR, mean TPR], or [0,0] when nothing was stored."""
        if self.recall and self.fpr:
            mean_fpr = sum(self.fpr) / len(self.fpr)
            mean_tpr = sum(self.recall) / len(self.recall)
            return [mean_fpr, mean_tpr]
        return [0, 0]

    def get_precision_summary(self):
        """Return the mean precision (0 when nothing was stored)."""
        values = self.precision
        return sum(values) / len(values) if values else 0

    def get_f05_summary(self):
        """Return the mean F0.5 score (0 when nothing was stored)."""
        values = self.f05
        return sum(values) / len(values) if values else 0

    def get_mcc_summary(self):
        """Return the mean MCC (0 when nothing was stored)."""
        values = self.mcc
        return sum(values) / len(values) if values else 0
95	-
if __name__ == '__main__':
    # os is used below but is not imported explicitly at the top of this
    # script, so import it here.
    import os

    if len(sys.argv) < 3:
        print("Usage: k-suite strategy_str sample_file")
        sys.exit(1)
    threshold = 20
    iterations = 30
    neighbors = [3, 5, 10, 50, 100, 150, 200, 300, 400, 500]
    cfg = Config()
    cfg.strategy = sys.argv[1]
    sample_file = sys.argv[2]
    sample_name = sample_file.split('/')[-1]

    # One popcon submission path per non-empty line of the sample file;
    # submissions are stored under a directory named after the first two
    # characters of the user id.
    population_sample = []
    with open(sample_file, 'r') as f:
        for line in f:
            user_id = line.strip()
            if user_id:
                population_sample.append(
                    os.path.join(cfg.popcon_dir, user_id[:2], user_id))

    # setup dictionaries and files
    roc_summary = {}
    recommended = {}
    precision_summary = {}
    f05_summary = {}
    mcc_summary = {}
    sample_dir = ("results/k-suite/%s" % sample_name)
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)
    log_file = os.path.join(sample_dir, cfg.strategy)
    with open(log_file, 'w') as f:
        f.write("# %s\n\n" % sample_name)
        f.write("# strategy %s recommendation_size %d iterations %d\n\n" %
                (cfg.strategy, threshold, iterations))
        f.write("# k coverage \tprecision \tf05 \tmcc\n\n")

    for k in neighbors:
        roc_summary[k] = []
        recommended[k] = set()
        precision_summary[k] = []
        f05_summary[k] = []
        mcc_summary[k] = []
        with open(log_file + "-k%.3d" % k, 'w') as f:
            f.write("# %s\n\n" % sample_name)
            f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy, k))
            f.write("# roc_point \tprecision \tf05 \tmcc\n\n")

    # main loop per user
    for submission_file in population_sample:
        # NOTE(review): PopconSystem is not among the names imported
        # explicitly by this script; presumably it arrives through
        # "from evaluation import *" - confirm.
        user = PopconSystem(submission_file)
        user.filter_pkg_profile(cfg.pkgs_filter)
        user.maximal_pkg_profile()
        for k in neighbors:
            cfg.k_neighbors = k
            rec = Recommender(cfg)
            repo_size = rec.items_repository.get_doccount()
            results = ExperimentResults(repo_size)
            # n iterations for same recommender and user
            for n in range(iterations):
                # Move a random 90% of the profile into `sample` (the
                # expected result); what is left in item_score is the
                # profile handed to the recommender.
                profile_len = len(user.pkg_profile)
                item_score = {}
                for pkg in user.pkg_profile:
                    item_score[pkg] = user.item_score[pkg]
                sample = {}
                sample_size = int(profile_len * 0.9)
                for key in random.sample(list(item_score), sample_size):
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user, threshold)
                if hasattr(recommendation, "ranking"):
                    results.add_result(recommendation.ranking, sample)
                    recommended[k].update(recommendation.ranking)
            # save summary
            roc_point = results.get_roc_point()
            roc_summary[k].append(roc_point)
            precision = results.get_precision_summary()
            precision_summary[k].append(precision)
            f05 = results.get_f05_summary()
            f05_summary[k].append(f05)
            mcc = results.get_mcc_summary()
            mcc_summary[k].append(mcc)
            with open(log_file + "-k%.3d" % k, 'a') as f:
                f.write("[%.2f,%.2f] \t%.4f \t%.4f \t%.4f\n" %
                        (roc_point[0], roc_point[1], precision, f05, mcc))

    # back to main flow
    with open(log_file, 'a') as f:
        plot_summary(precision_summary, f05_summary, mcc_summary, log_file)
        for k in neighbors:
            # Coverage: share of the repository that was recommended at
            # least once with this k. (The original indexed recommended
            # with the undefined name `size`.)
            coverage = len(recommended[k]) / float(repo_size)
            f.write("%3d \t%.2f \t%.4f \t%.4f \t%.4f\n" %
                    (k, coverage,
                     float(sum(precision_summary[k])) / len(precision_summary[k]),
                     float(sum(f05_summary[k])) / len(f05_summary[k]),
                     float(sum(mcc_summary[k])) / len(mcc_summary[k])))
            plot_roc(k, roc_summary[k], log_file)
	@@ -1,231 +0,0 @@	@@ -1,231 +0,0 @@
1	-#!/usr/bin/env python
2	-"""
3	- recommender suite - recommender experiments suite
4	-"""
5	-__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
6	-__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
7	-__license__ = """
8	- This program is free software: you can redistribute it and/or modify
9	- it under the terms of the GNU General Public License as published by
10	- the Free Software Foundation, either version 3 of the License, or
11	- (at your option) any later version.
12	-
13	- This program is distributed in the hope that it will be useful,
14	- but WITHOUT ANY WARRANTY; without even the implied warranty of
15	- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16	- GNU General Public License for more details.
17	-
18	- You should have received a copy of the GNU General Public License
19	- along with this program. If not, see <http://www.gnu.org/licenses/>.
20	-"""
21	-
22	-import sys
23	-sys.path.insert(0,'../')
24	-from config import Config
25	-from data import PopconXapianIndex, PopconSubmission
26	-from recommender import Recommender
27	-from user import LocalSystem, User
28	-from evaluation import *
29	-import logging
30	-import random
31	-import Gnuplot
32	-import numpy
33	-
# Smaller alternative configuration, left commented out (presumably for
# quick trial runs - confirm before relying on it):
#iterations = 3
#sample_proportions = [0.9]
#weighting = [('bm25',1.2)]
#collaborative = ['knn_eset']
#content_based = ['cb']
#hybrid = ['knnco']
#profile_size = [50,100]
#popcon_size = ["1000"]
#neighbors = [50]

# Number of random profile splits evaluated for each setup.
iterations = 30
# Fractions of the user profile that are held out as the expected sample.
sample_proportions = [0.9]
# (cfg.weight, cfg.bm25_k1) pairs applied in run_strategy.
weighting = [('bm25',1.0)]
# Strategy names grouped by family; get_label uses the family to decide
# which parameters appear in the label.
content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
collaborative = ['knn_eset','knn','knn_plus']
hybrid = ['knnco','knnco_eset']
# Values assigned to cfg.profile_size (content-based and hybrid runs).
profile_size = range(20,200,40)
# Values assigned to cfg.k_neighbors (collaborative and hybrid runs).
neighbors = range(10,510,50)
52	-
def write_recall_log(label, n, sample, recommendation, profile_size, repo_size, log_file):
    """
    Write the recall log of iteration n to "<log_file>-NN".

    The log holds the label, the repository/profile/sample sizes and, when
    the recommendation has a ranking, the sorted ranking positions of the
    sample packages followed by the packages that were not recommended.

    label          -- dict with "description" and "values" entries
    n              -- iteration number (zero-padded to two digits)
    sample         -- dict whose keys are the expected packages
    recommendation -- object that may carry a `ranking` sequence
    profile_size   -- size of the full user profile
    repo_size      -- number of items in the repository
    log_file       -- path prefix of the log
    """
    # "with" closes the file even if one of the writes fails
    with open(("%s-%.2d" % (log_file, n)), 'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%.2d\n" % (label["values"], n))
        output.write("\n# repository profile sample\n%d %d %d\n" %
                     (repo_size, profile_size, len(sample)))
        if hasattr(recommendation, "ranking"):
            # Index the ranking once (first occurrence wins, like
            # list.index) instead of scanning it for every sample package.
            position = {}
            for rank, pkg in enumerate(recommendation.ranking):
                position.setdefault(pkg, rank)
            ranks = sorted(position[pkg] for pkg in sample if pkg in position)
            notfound = [pkg for pkg in sample if pkg not in position]
            for r in ranks:
                output.write(str(r) + "\n")
            if notfound:
                output.write("# out of recommendation:\n")
                for pkg in notfound:
                    output.write(pkg + "\n")
75	-
def plot_roc(roc_points, eauc, c, p, log_file):
    """
    Plot a ROC curve annotated with coverage (c), P(20) (p) and the
    extended AUC (eauc), next to the (0,0)-(1,1) diagonal.

    The chart is saved as "<log_file>-roc.png" and "<log_file>-roc.ps".
    """
    output_base = log_file + "-roc"
    setup_name = log_file.split("/")[-1]

    g = Gnuplot.Gnuplot()
    g('set style data lines')
    g.xlabel('False Positive Rate')
    g.ylabel('True Positive Rate')
    g('set xrange [0:1.0]')
    g('set yrange [0:1.0]')
    g.title("Setup: %s" % setup_name)
    # Summary figures are printed in the lower right corner of the chart
    g('set label "C %.2f" at 0.8,0.25' % c)
    g('set label "P(20) %.2f" at 0.8,0.2' % p)
    g('set label "AUC %.4f" at 0.8,0.15' % eauc)

    curve = Gnuplot.Data(roc_points, title="ROC")
    diagonal = Gnuplot.Data([[0, 0], [1, 1]], with_="lines lt 7")
    g.plot(curve, diagonal)

    g.hardcopy(output_base + ".png", terminal="png")
    g.hardcopy(output_base + ".ps", terminal="postscript", enhanced=1, color=1)
92	-
def get_label(cfg, sample_proportion):
    """
    Build the label of the current setup.

    Returns a dict with a "description" (which parameters are encoded) and
    "values" (the encoded parameters, used as log file name). For a
    strategy that belongs to no known family a message is printed and an
    empty dict is returned. sample_proportion is not used here.
    """
    strategy = cfg.strategy
    if strategy in content_based:
        return {"description": "strategy-profile",
                "values": "%s-profile%.3d" % (strategy, cfg.profile_size)}
    if strategy in collaborative:
        return {"description": "strategy-knn",
                "values": "%s-k%.3d" % (strategy, cfg.k_neighbors)}
    if strategy in hybrid:
        return {"description": "strategy-knn-profile",
                "values": "%s-k%.3d-profile%.3d" %
                          (strategy, cfg.k_neighbors, cfg.profile_size)}
    print("Unknown strategy")
    return {}
110	-
class ExperimentResults:
    """
    Collects precision, recall and false positive rate at several
    recommendation sizes (1, 10, 20, ... below the repository size) and
    turns them into an averaged ROC curve.
    """

    def __init__(self, repo_size):
        self.repository_size = repo_size
        self.precision = {}
        self.recall = {}
        self.fpr = {}
        # list(...) keeps the concatenation valid where range() is lazy
        points = [1] + list(range(10, self.repository_size, 10))
        # every package that showed up in some evaluated ranking
        self.recommended = set()
        for size in points:
            self.precision[size] = []
            self.recall[size] = []
            self.fpr[size] = []

    def add_result(self, ranking, sample):
        """
        Evaluate the top `size` entries of ranking against sample, for
        every recommendation size being tracked.
        """
        self.recommended.update(ranking)
        # get data only for point
        for size in self.precision.keys():
            predicted = RecommendationResult(dict.fromkeys(ranking[:size], 1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted, real, self.repository_size)
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            self.fpr[size].append(evaluation.run(FPR()))

    def get_roc_points(self):
        """
        Return the sorted list of [mean FPR, mean TPR] points, one per
        recommendation size (the ROC is averaged by threshold).

        Sizes that have no data yet are skipped, so the result is an empty
        list when add_result was never called.
        """
        points = []
        for size in self.recall.keys():
            tpr = self.recall[size]
            fpr = self.fpr[size]
            if tpr and fpr:
                points.append([float(sum(fpr)) / len(fpr),
                               float(sum(tpr)) / len(tpr)])
        return sorted(points)
143	-
def run_strategy(cfg, user):
    """
    Evaluate the strategy currently configured in cfg for the given user.

    For each entry of `weighting` a Recommender is built; for each entry of
    `sample_proportions` the user profile is randomly split `iterations`
    times, the held-out part being the expected result of a recommendation
    computed from the remaining part. Recall logs, a summary (coverage,
    P(20), AUC, extended AUC) and the ROC plot are written under
    "results/roc-suite/<first 8 characters of user_id>/".
    """
    # os is not imported explicitly at the top of this script.
    import os

    for weight in weighting:
        cfg.weight = weight[0]
        cfg.bm25_k1 = weight[1]
        rec = Recommender(cfg)
        repo_size = rec.items_repository.get_doccount()
        for proportion in sample_proportions:
            results = ExperimentResults(repo_size)
            label = get_label(cfg, proportion)
            user_dir = ("results/roc-suite/%s" % user.user_id[:8])
            if not os.path.exists(user_dir):
                # makedirs also creates "results/roc-suite" on a first run,
                # where a plain mkdir would fail.
                os.makedirs(user_dir)
            log_file = os.path.join(user_dir, label["values"])
            evaluated = 0
            for n in range(iterations):
                # Move a random share of the profile into `sample`; what
                # remains in item_score is given to the recommender.
                profile_len = len(user.pkg_profile)
                item_score = {}
                for pkg in user.pkg_profile:
                    item_score[pkg] = user.item_score[pkg]
                sample = {}
                sample_size = int(profile_len * proportion)
                for key in random.sample(list(item_score), sample_size):
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user, repo_size)
                write_recall_log(label, n, sample, recommendation,
                                 profile_len, repo_size, log_file)
                if hasattr(recommendation, "ranking"):
                    results.add_result(recommendation.ranking, sample)
                    evaluated += 1
            if not evaluated:
                # No ranking was produced: there is nothing to average.
                continue
            roc_points = results.get_roc_points()
            x_coord = [p[0] for p in roc_points]
            y_coord = [p[1] for p in roc_points]
            auc = numpy.trapz(y=y_coord, x=x_coord)
            # Extended AUC: close the curve with straight segments from
            # (0,0) to the first point and from the last point to (1,1).
            eauc = (auc +
                    numpy.trapz(y=[0, roc_points[0][1]], x=[0, roc_points[0][0]]) +
                    numpy.trapz(y=[roc_points[-1][1], 1], x=[roc_points[-1][0], 1]))
            # The value is reported as p(20) in the log and on the plot, so
            # read it at recommendation size 20 (the original read size
            # 10). Fall back to size 10 for repositories too small to have
            # a 20-item point.
            p20_values = results.precision.get(20) or results.precision[10]
            precision_20 = sum(p20_values) / len(p20_values)
            coverage = len(results.recommended) / float(repo_size)
            with open(log_file, 'w') as f:
                f.write("# %s\n# %s\n\n" %
                        (label["description"], label["values"]))
                f.write("# coverage \tp(20) \tauc \teauc\n\t%.2f \t%.2f \t%.4f \t%.4f\n\n" %
                        (coverage, precision_20, auc, eauc))
            plot_roc(roc_points, eauc, coverage, precision_20, log_file)
188	-
def run_content(user, cfg):
    """Run every content-based strategy with every configured profile size."""
    for strategy_name in content_based:
        cfg.strategy = strategy_name
        for n_terms in profile_size:
            cfg.profile_size = n_terms
            run_strategy(cfg, user)
195	-
def run_collaborative(user, cfg):
    """Run every collaborative strategy with every configured neighborhood size."""
    # The original also copied cfg.popcon_desktopapps and
    # cfg.popcon_programs into locals that were never used; dropped.
    for strategy in collaborative:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            run_strategy(cfg, user)
204	-
def run_hybrid(user, cfg):
    """
    Run every hybrid strategy with every combination of neighborhood size
    and profile size.
    """
    # The original also copied cfg.popcon_desktopapps and
    # cfg.popcon_programs into locals that were never used; dropped.
    for strategy in hybrid:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            for size in profile_size:
                cfg.profile_size = size
                run_strategy(cfg, user)
215	-
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: roc-suite popcon_submission_path [content|collaborative|hybrid]")
        sys.exit(1)

    cfg = Config()
    # NOTE(review): PopconSystem is not among the names imported explicitly
    # by this script; presumably it arrives through
    # "from evaluation import *" - confirm.
    user = PopconSystem(sys.argv[1])
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    # Strategy families are selected by the arguments that follow the
    # submission path; the script name and the path themselves are not
    # matched against the keywords. Without any selection, run everything.
    requested = sys.argv[2:]
    run_all = not requested

    if run_all or "content" in requested:
        run_content(user, cfg)
    if run_all or "collaborative" in requested:
        run_collaborative(user, cfg)
    if run_all or "hybrid" in requested:
        run_hybrid(user, cfg)
	@@ -1,274 +0,0 @@	@@ -1,274 +0,0 @@
1	-#!/usr/bin/env python
2	-"""
3	- recommender suite - recommender experiments suite
4	-"""
5	-__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
6	-__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
7	-__license__ = """
8	- This program is free software: you can redistribute it and/or modify
9	- it under the terms of the GNU General Public License as published by
10	- the Free Software Foundation, either version 3 of the License, or
11	- (at your option) any later version.
12	-
13	- This program is distributed in the hope that it will be useful,
14	- but WITHOUT ANY WARRANTY; without even the implied warranty of
15	- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16	- GNU General Public License for more details.
17	-
18	- You should have received a copy of the GNU General Public License
19	- along with this program. If not, see <http://www.gnu.org/licenses/>.
20	-"""
21	-
22	-import sys
23	-sys.path.insert(0,'../')
24	-from config import Config
25	-from data import PopconXapianIndex, PopconSubmission, AppAptXapianIndex
26	-from recommender import Recommender
27	-from user import LocalSystem, User
28	-from evaluation import *
29	-import logging
30	-import random
31	-import Gnuplot
32	-
# Smaller alternative configuration, left commented out (presumably for
# quick trial runs - confirm before relying on it):
#iterations = 3
#sample_proportions = [0.9]
#weighting = [('bm25',1.2)]
#collaborative = ['knn']
#content_based = []
#hybrid = ['knnco']
#profile_size = [50,100]
#popcon_size = ["1000"]
#neighbors = [50]

# Number of random profile splits evaluated for each setup.
iterations = 10
# Fractions of the user profile that are held out as the expected sample.
sample_proportions = [0.5, 0.6, 0.7, 0.8, 0.9]
# (cfg.weight, cfg.bm25_k1) pairs applied in run_strategy.
weighting = [('bm25',1.2), ('bm25',1.6), ('bm25',2.0), ('trad',0)]
# Strategy names grouped by family; get_label uses the family to decide
# which parameters appear in the label.
content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
collaborative = ['knn_eset','knn','knn_plus']
hybrid = ['knnco','knnco_eset']

# Values assigned to cfg.profile_size (content-based and hybrid runs).
profile_size = range(20,100,20)
#popcon_size = [1000,10000,50000,'full']
# Values assigned to cfg.k_neighbors (collaborative and hybrid runs).
neighbors = range(10,510,50)
53	-
def write_recall_log(label, n, sample, recommendation, profile_size, repo_size, log_file):
    """
    Write the recall log of iteration n to "<log_file>-<n>".

    The log holds the label, the repository/profile/sample sizes and, when
    the recommendation has a ranking, the sorted ranking positions of the
    sample packages followed by the packages that were not recommended.

    label          -- dict with "description" and "values" entries
    n              -- iteration number
    sample         -- dict whose keys are the expected packages
    recommendation -- object that may carry a `ranking` sequence
    profile_size   -- size of the full user profile
    repo_size      -- number of items in the repository
    log_file       -- path prefix of the log
    """
    # "with" closes the file even if one of the writes fails
    with open(("%s-%d" % (log_file, n)), 'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%d\n" % (label["values"], n))
        output.write("\n%d %d %d\n" %
                     (repo_size, profile_size, len(sample)))
        if hasattr(recommendation, "ranking"):
            # Index the ranking once (first occurrence wins, like
            # list.index) instead of scanning it for every sample package.
            position = {}
            for rank, pkg in enumerate(recommendation.ranking):
                position.setdefault(pkg, rank)
            ranks = sorted(position[pkg] for pkg in sample if pkg in position)
            notfound = [pkg for pkg in sample if pkg not in position]
            for r in ranks:
                output.write(str(r) + "\n")
            if notfound:
                output.write("Out of recommendation:\n")
                for pkg in notfound:
                    output.write(pkg + "\n")
76	-
def plot_summary(precision, recall, f1, f05, accuracy, log_file):
    """
    Plot the metric summaries against the recommendation size.

    The chart is saved as "<log_file>.png"/".ps" and, after switching the
    x axis to logarithmic scale, as "<log_file>-logscale.png"/".ps".
    """
    setup_name = log_file.split("/")[-1]
    logscale_base = log_file + "-logscale"

    g = Gnuplot.Gnuplot()
    g('set style data lines')
    g.xlabel('Recommendation size')
    g.title("Setup: %s" % setup_name)
    curves = [Gnuplot.Data(accuracy, title="Accuracy"),
              Gnuplot.Data(precision, title="Precision"),
              Gnuplot.Data(recall, title="Recall"),
              Gnuplot.Data(f1, title="F_1"),
              Gnuplot.Data(f05, title="F_0.5")]
    g.plot(*curves)
    g.hardcopy(log_file + ".png", terminal="png")
    g.hardcopy(log_file + ".ps", terminal="postscript", enhanced=1, color=1)

    # Same chart again with a logarithmic x axis
    g('set logscale x')
    g('replot')
    g.hardcopy(logscale_base + ".png", terminal="png")
    g.hardcopy(logscale_base + ".ps", terminal="postscript", enhanced=1, color=1)
94	-
def get_label(cfg, sample_proportion):
    """
    Build the label of the current setup.

    Returns a dict with a "description" (which parameters are encoded) and
    "values" (the encoded parameters, used as log file name). For a
    strategy that belongs to no known family a message is printed and an
    empty dict is returned.
    """
    strategy = cfg.strategy
    if strategy in content_based:
        filter_name = cfg.pkgs_filter.split("/")[-1]
        return {"description": "strategy-filter-profile-k1_bm25-sample",
                "values": "%s-profile%d-%s-kbm%.1f-sample%.1f" %
                          (strategy, cfg.profile_size, filter_name,
                           cfg.bm25_k1, sample_proportion)}
    if strategy in collaborative:
        filter_name = cfg.pkgs_filter.split("/")[-1]
        return {"description": "strategy-knn-filter-k1_bm25-sample",
                "values": "%s-k%d-%s-kbm%.1f-sample%.1f" %
                          (strategy, cfg.k_neighbors, filter_name,
                           cfg.bm25_k1, sample_proportion)}
    if strategy in hybrid:
        filter_name = cfg.pkgs_filter.split("/")[-1]
        return {"description": "strategy-knn-filter-profile-k1_bm25-sample",
                "values": "%s-k%d-profile%d-%s-kbm%.1f-sample%.1f" %
                          (strategy, cfg.k_neighbors, cfg.profile_size,
                           filter_name, cfg.bm25_k1, sample_proportion)}
    print("Unknown strategy")
    return {}
118	-
class ExperimentResults:
    """
    Collects accuracy, precision, recall, F1 and F0.5 at several
    recommendation sizes (1, 10, 20, ..., 190, then 200, 300, ... below the
    repository size) and summarizes them.
    """

    def __init__(self, repo_size):
        self.repository_size = repo_size
        self.accuracy = {}
        self.precision = {}
        self.recall = {}
        self.f1 = {}
        self.f05 = {}
        # list(...) keeps the concatenation valid where range() is lazy
        points = ([1] + list(range(10, 200, 10)) +
                  list(range(200, self.repository_size, 100)))
        for size in points:
            self.accuracy[size] = []
            self.precision[size] = []
            self.recall[size] = []
            self.f1[size] = []
            self.f05[size] = []

    def add_result(self, ranking, sample):
        """
        Evaluate the top `size` entries of ranking against sample, for
        every recommendation size being tracked.
        """
        for size in self.accuracy.keys():
            predicted = RecommendationResult(dict.fromkeys(ranking[:size], 1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted, real, self.repository_size)
            self.accuracy[size].append(evaluation.run(Accuracy()))
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            self.f1[size].append(evaluation.run(F_score(1)))
            self.f05[size].append(evaluation.run(F_score(0.5)))

    @staticmethod
    def _summary(metric):
        # Sorted [size, mean value] pairs; sizes without data are skipped
        # rather than dividing by zero.
        return sorted([size, float(sum(values)) / len(values)]
                      for size, values in metric.items() if values)

    @staticmethod
    def _best(metric):
        # (size, value) of the highest single value stored in metric.
        # Raises ValueError when metric holds no value at all.
        size = max((s for s in metric if metric[s]),
                   key=lambda s: max(metric[s]))
        return (size, max(metric[size]))

    def get_precision_summary(self):
        """Return sorted [size, mean precision] pairs."""
        return self._summary(self.precision)

    def get_recall_summary(self):
        """Return sorted [size, mean recall] pairs."""
        return self._summary(self.recall)

    def get_f1_summary(self):
        """Return sorted [size, mean F1] pairs."""
        return self._summary(self.f1)

    def get_f05_summary(self):
        """Return sorted [size, mean F0.5] pairs."""
        return self._summary(self.f05)

    def get_accuracy_summary(self):
        """Return sorted [size, mean accuracy] pairs."""
        return self._summary(self.accuracy)

    def best_precision(self):
        """Return (size, value) of the best precision observed."""
        return self._best(self.precision)

    def best_f1(self):
        """Return (size, value) of the best F1 observed."""
        return self._best(self.f1)

    def best_f05(self):
        """Return (size, value) of the best F0.5 observed."""
        return self._best(self.f05)
177	-
def run_strategy(cfg, user):
    """
    Evaluate the strategy currently configured in cfg for the given user.

    For each entry of `weighting` a Recommender is built; for each entry of
    `sample_proportions` the user profile is randomly split `iterations`
    times, the held-out part being the expected result of a recommendation
    computed from the remaining part. Recall logs, a summary file and the
    metrics plot are written under "results/strategies/".
    """
    # os is not imported explicitly at the top of this script.
    import os

    # The log files are opened for writing below, so make sure their
    # directory exists.
    if not os.path.exists("results/strategies"):
        os.makedirs("results/strategies")

    for weight in weighting:
        cfg.weight = weight[0]
        cfg.bm25_k1 = weight[1]
        rec = Recommender(cfg)
        repo_size = rec.items_repository.get_doccount()
        for proportion in sample_proportions:
            results = ExperimentResults(repo_size)
            label = get_label(cfg, proportion)
            log_file = "results/strategies/" + label["values"]
            evaluated = 0
            for n in range(iterations):
                # Move a random share of the profile into `sample`; what
                # remains in item_score is given to the recommender.
                # (profile_len rather than profile_size, to avoid shadowing
                # the module-level list of profile sizes.)
                profile_len = len(user.pkg_profile)
                item_score = {}
                for pkg in user.pkg_profile:
                    item_score[pkg] = user.item_score[pkg]
                sample = {}
                sample_size = int(profile_len * proportion)
                for key in random.sample(list(item_score), sample_size):
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user, repo_size)
                write_recall_log(label, n, sample, recommendation,
                                 profile_len, repo_size, log_file)
                if hasattr(recommendation, "ranking"):
                    results.add_result(recommendation.ranking, sample)
                    evaluated += 1
            if not evaluated:
                # No ranking was produced: there is nothing to summarize.
                continue
            precision_10 = sum(results.precision[10]) / len(results.precision[10])
            f1_10 = sum(results.f1[10]) / len(results.f1[10])
            f05_10 = sum(results.f05[10]) / len(results.f05[10])
            # Each best_* call scans every stored value: compute them once.
            best_precision = results.best_precision()
            best_f1 = results.best_f1()
            best_f05 = results.best_f05()
            with open(log_file, 'w') as f:
                # NOTE(review): "coverage" is the size of the recommendation
                # of the last iteration only - confirm this is intended.
                f.write("# %s\n# %s\n\ncoverage %d\n\n" %
                        (label["description"], label["values"], recommendation.size))
                f.write("# best results (recommendation size; metric)\n")
                f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\nf05 (%d; %.2f)\n\n" %
                        (best_precision[0], best_precision[1],
                         best_f1[0], best_f1[1],
                         best_f05[0], best_f05[1]))
                f.write("# recommendation size 10\nprecision (10; %.2f)\nf1 (10; %.2f)\nf05 (10; %.2f)" %
                        (precision_10, f1_10, f05_10))
            precision = results.get_precision_summary()
            recall = results.get_recall_summary()
            f1 = results.get_f1_summary()
            f05 = results.get_f05_summary()
            accuracy = results.get_accuracy_summary()
            plot_summary(precision, recall, f1, f05, accuracy, log_file)
223	-
def run_content(user, cfg):
    """Run every content-based strategy with every configured profile size."""
    for strategy_name in content_based:
        cfg.strategy = strategy_name
        for n_terms in profile_size:
            cfg.profile_size = n_terms
            run_strategy(cfg, user)
230	-
def run_collaborative(user, cfg):
    """Run every collaborative strategy with every configured neighborhood size."""
    # The original kept copies of cfg.popcon_desktopapps and
    # cfg.popcon_programs for a commented-out loop over popcon index sizes;
    # neither the copies nor that loop were in use, so both were dropped.
    for strategy in collaborative:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            run_strategy(cfg, user)
243	-
def run_hybrid(user, cfg):
    """
    Run every hybrid strategy with every combination of neighborhood size
    and profile size.
    """
    # The original kept copies of cfg.popcon_desktopapps and
    # cfg.popcon_programs for a commented-out loop over popcon index sizes;
    # neither the copies nor that loop were in use, so both were dropped.
    for strategy in hybrid:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            for size in profile_size:
                cfg.profile_size = size
                run_strategy(cfg, user)
258	-
if __name__ == '__main__':
    # Alternative users tried before:
    #user = LocalSystem()
    #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
    #user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")

    cfg = Config()
    user = PopconSystem("/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7")
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    # Without arguments every family runs; otherwise only the families
    # whose keyword appears on the command line.
    run_all = len(sys.argv) < 2
    families = (("content", run_content),
                ("collaborative", run_collaborative),
                ("hybrid", run_hybrid))
    for keyword, runner in families:
        if run_all or keyword in sys.argv:
            runner(user, cfg)