Renamed files.

Tássia Camões Araújo
1 parent e2be2c33
Showing 3 changed files with 0 additions and 691 deletions Show diff stats
src/experiments/k-suite.py
src/experiments/roc-suite.py
src/experiments/strategies-suite.py
@@ -1,186 +0,0 @@
-#!/usr/bin/env python
-"""
-    k-suite - experiment different neighborhood sizes
-"""
-__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
-__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
-__license__ = """
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-"""
-
-import sys
-sys.path.insert(0,'../')
-from config import Config
-from data import PopconXapianIndex, PopconSubmission
-from recommender import Recommender
-from user import LocalSystem, User
-from evaluation import *
-import logging
-import random
-import Gnuplot
-import numpy
-
-def plot_roc(k,roc_points,log_file):
-    g = Gnuplot.Gnuplot()
-    g('set style data points')
-    g.xlabel('False Positive Rate')
-    g.ylabel('True Positive Rate')
-    g('set xrange [0:1.0]')
-    g('set yrange [0:1.0]')
-    g.title("Setup: %s-k%d" % (log_file.split("/")[-1],k))
-    g.plot(Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"),
-           Gnuplot.Data(roc_points))
-    g.hardcopy(log_file+("-k%.3d.png"%k),terminal="png")
-    g.hardcopy(log_file+("-k%.3d.ps"%k),terminal="postscript",enhanced=1,color=1)
-
-def plot_summary(precision,f05,mcc,log_file):
-    g = Gnuplot.Gnuplot()
-    g('set style data lines')
-    g.xlabel('Neighborhood (k)')
-    g.title("Setup: %s-size20" % (log_file.split("/")[-1]))
-    g.plot(Gnuplot.Data([[k,sum(precision[k])/len(precision[k])] for k in precision.keys()],title="P"),
-           Gnuplot.Data([[k,sum(f05[k])/len(f05[k])] for k in f05.keys()],title="F05"),
-           Gnuplot.Data([[k,sum(mcc[k])/len(mcc[k])] for k in mcc.keys()],title="MCC"))
-    g.hardcopy(log_file+(".png"),terminal="png")
-    g.hardcopy(log_file+(".ps"),terminal="postscript",enhanced=1,color=1)
-
-class ExperimentResults:
-    def __init__(self,repo_size):
-        self.repository_size = repo_size
-        self.precision = []
-        self.recall = []
-        self.fpr = []
-        self.f05 = []
-        self.mcc = []
-
-    def add_result(self,ranking,sample):
-        predicted = RecommendationResult(dict.fromkeys(ranking,1))
-        real = RecommendationResult(sample)
-        evaluation = Evaluation(predicted,real,self.repository_size)
-        self.precision.append(evaluation.run(Precision()))
-        self.recall.append(evaluation.run(Recall()))
-        self.fpr.append(evaluation.run(FPR()))
-        self.f05.append(evaluation.run(F_score(0.5)))
-        self.mcc.append(evaluation.run(MCC()))
-
-    def get_roc_point(self):
-        tpr = self.recall
-        fpr = self.fpr
-        if not tpr or not fpr:
-            return [0,0]
-        return [sum(fpr)/len(fpr),sum(tpr)/len(tpr)]
-
-    def get_precision_summary(self):
-        if not self.precision: return 0
-        return  sum(self.precision)/len(self.precision)
-
-    def get_f05_summary(self):
-        if not self.f05: return 0
-        return  sum(self.f05)/len(self.f05)
-
-    def get_mcc_summary(self):
-        if not self.mcc: return 0
-        return  sum(self.mcc)/len(self.mcc)
-
-if __name__ == '__main__':
-    if len(sys.argv)<3:
-        print "Usage: k-suite strategy_str sample_file"
-        exit(1)
-    threshold = 20
-    iterations = 30
-    neighbors = [3,5,10,50,100,150,200,300,400,500]
-    cfg = Config()
-    cfg.strategy = sys.argv[1]
-    sample_file = sys.argv[2]
-    population_sample = []
-    with open(sample_file,'r') as f:
-        for line in f.readlines():
-            user_id = line.strip('\n')
-            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
-    # setup dictionaries and files
-    roc_summary = {}
-    recommended = {}
-    precision_summary = {}
-    f05_summary = {}
-    mcc_summary = {}
-    sample_dir = ("results/k-suite/%s" % sample_file.split('/')[-1])
-    if not os.path.exists(sample_dir):
-        os.makedirs(sample_dir)
-    log_file = os.path.join(sample_dir,cfg.strategy)
-    with open(log_file,'w') as f:
-        f.write("# %s\n\n" % sample_file.split('/')[-1])
-        f.write("# strategy %s recommendation_size %d iterations %d\n\n" %
-                (cfg.strategy,threshold,iterations))
-        f.write("# k coverage \tprecision \tf05 \tmcc\n\n")
-
-    for k in neighbors:
-        roc_summary[k] = []
-        recommended[k] = set()
-        precision_summary[k] = []
-        f05_summary[k] = []
-        mcc_summary[k] = []
-        with open(log_file+"-k%.3d"%k,'w') as f:
-            f.write("# %s\n\n" % sample_file.split('/')[-1])
-            f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))
-            f.write("# roc_point \tprecision \tf05 \tmcc\n\n")
-
-    # main loop per user
-    for submission_file in population_sample:
-        user = PopconSystem(submission_file)
-        user.filter_pkg_profile(cfg.pkgs_filter)
-        user.maximal_pkg_profile()
-        for k in neighbors:
-            cfg.k_neighbors = k
-            rec = Recommender(cfg)
-            repo_size = rec.items_repository.get_doccount()
-            results = ExperimentResults(repo_size)
-            # n iterations for same recommender and user
-            for n in range(iterations):
-                # Fill sample profile
-                profile_len = len(user.pkg_profile)
-                item_score = {}
-                for pkg in user.pkg_profile:
-                    item_score[pkg] = user.item_score[pkg]
-                sample = {}
-                sample_size = int(profile_len*0.9)
-                for i in range(sample_size):
-                     key = random.choice(item_score.keys())
-                     sample[key] = item_score.pop(key)
-                iteration_user = User(item_score)
-                recommendation = rec.get_recommendation(iteration_user,threshold)
-                if hasattr(recommendation,"ranking"):
-                    results.add_result(recommendation.ranking,sample)
-                    recommended[k] = recommended[k].union(recommendation.ranking)
-            # save summary
-            roc_point = results.get_roc_point()
-            roc_summary[k].append(roc_point)
-            precision = results.get_precision_summary()
-            precision_summary[k].append(precision)
-            f05 = results.get_f05_summary()
-            f05_summary[k].append(f05)
-            mcc = results.get_mcc_summary()
-            mcc_summary[k].append(mcc)
-            with open(log_file+"-k%.3d"%k,'a') as f:
-                f.write("[%.2f,%.2f] \t%.4f \t%.4f \t%.4f\n" %
-                        (roc_point[0],roc_point[1],precision,f05,mcc))
-    # back to main flow
-    with open(log_file,'a') as f:
-        plot_summary(precision_summary,f05_summary,mcc_summary,log_file)
-        for k in neighbors:
-            coverage = len(recommended[size])/float(repo_size)
-            f.write("%3d \t%.2f \t%.4f \t%.4f \t%.4f\n" %
-                    (k,coverage,float(sum(precision_summary[k]))/len(precision_summary[k]),
-                     float(sum(f05_summary[k]))/len(f05_summary[k]),
-                     float(sum(mcc_summary[k]))/len(mcc_summary[k])))
-            plot_roc(k,roc_summary[k],log_file)
@@ -1,231 +0,0 @@
-#!/usr/bin/env python
-"""
-    recommender suite - recommender experiments suite 
-"""
-__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
-__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
-__license__ = """
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-"""
-
-import sys
-sys.path.insert(0,'../')
-from config import Config
-from data import PopconXapianIndex, PopconSubmission
-from recommender import Recommender
-from user import LocalSystem, User
-from evaluation import *
-import logging
-import random
-import Gnuplot
-import numpy
-
-#iterations = 3
-#sample_proportions = [0.9]
-#weighting = [('bm25',1.2)]
-#collaborative = ['knn_eset']
-#content_based = ['cb']
-#hybrid = ['knnco']
-#profile_size = [50,100]
-#popcon_size = ["1000"]
-#neighbors = [50]
-
-iterations = 30
-sample_proportions = [0.9]
-weighting = [('bm25',1.0)]
-content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
-collaborative = ['knn_eset','knn','knn_plus']
-hybrid = ['knnco','knnco_eset']
-profile_size = range(20,200,40)
-neighbors = range(10,510,50)
-
-def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
-    # Write recall log
-    output = open(("%s-%.2d" % (log_file,n)),'w')
-    output.write("# %s-n\n" % label["description"])
-    output.write("# %s-%.2d\n" % (label["values"],n))
-    output.write("\n# repository profile sample\n%d %d %d\n" % \
-                 (repo_size,profile_size,len(sample)))
-    if hasattr(recommendation,"ranking"):
-        notfound = []
-        ranks = []
-        for pkg in sample.keys():
-            if pkg in recommendation.ranking:
-                ranks.append(recommendation.ranking.index(pkg))
-            else:
-                notfound.append(pkg)
-        for r in sorted(ranks):
-            output.write(str(r)+"\n")
-        if notfound:
-            output.write("# out of recommendation:\n")
-            for pkg in notfound:
-                output.write(pkg+"\n")
-    output.close()
-
-def plot_roc(roc_points,eauc,c,p,log_file):
-    g = Gnuplot.Gnuplot()
-    g('set style data lines')
-    g.xlabel('False Positive Rate')
-    g.ylabel('True Positive Rate')
-    g('set xrange [0:1.0]')
-    g('set yrange [0:1.0]')
-    g.title("Setup: %s" % log_file.split("/")[-1])
-    g('set label "C %.2f" at 0.8,0.25' % c)
-    g('set label "P(20) %.2f" at 0.8,0.2' % p)
-    g('set label "AUC %.4f" at 0.8,0.15' % eauc)
-    g.plot(Gnuplot.Data(roc_points,title="ROC"),
-           Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"))
-           #Gnuplot.Data([roc_points[-1],[1,1]],with_="lines lt 6"))
-    g.hardcopy(log_file+"-roc.png",terminal="png")
-    g.hardcopy(log_file+"-roc.ps",terminal="postscript",enhanced=1,color=1)
-
-def get_label(cfg,sample_proportion):
-    label = {}
-    if cfg.strategy in content_based:
-        label["description"] = "strategy-profile"
-        label["values"] = ("%s-profile%.3d" %
-                           (cfg.strategy,cfg.profile_size))
-    elif cfg.strategy in collaborative:
-       label["description"] = "strategy-knn"
-       label["values"] = ("%s-k%.3d" %
-                          (cfg.strategy,cfg.k_neighbors))
-    elif cfg.strategy in hybrid:
-       label["description"] = "strategy-knn-profile"
-       label["values"] = ("%s-k%.3d-profile%.3d" %
-                          (cfg.strategy,cfg.k_neighbors,cfg.profile_size))
-    else:
-        print "Unknown strategy"
-    return label
-
-class ExperimentResults:
-    def __init__(self,repo_size):
-        self.repository_size = repo_size
-        self.precision = {}
-        self.recall = {}
-        self.fpr = {}
-        points = [1]+range(10,self.repository_size,10)
-        self.recommended = set()
-        for size in points:
-            self.precision[size] = []
-            self.recall[size] = []
-            self.fpr[size] = []
-
-    def add_result(self,ranking,sample):
-        self.recommended = self.recommended.union(ranking)
-        # get data only for point
-        for size in self.precision.keys():
-            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
-            real = RecommendationResult(sample)
-            evaluation = Evaluation(predicted,real,self.repository_size)
-            self.precision[size].append(evaluation.run(Precision()))
-            self.recall[size].append(evaluation.run(Recall()))
-            self.fpr[size].append(evaluation.run(FPR()))
-
-    # Average ROC by threshold (= size of recommendation)
-    def get_roc_points(self):
-        points = []
-        for size in self.recall.keys():
-            tpr = self.recall[size]
-            fpr = self.fpr[size]
-            points.append([sum(fpr)/len(fpr),sum(tpr)/len(tpr)])
-        return sorted(points)
-
-def run_strategy(cfg,user):
-    for weight in weighting:
-        cfg.weight = weight[0]
-        cfg.bm25_k1 = weight[1]
-        rec = Recommender(cfg)
-        repo_size = rec.items_repository.get_doccount()
-        for proportion in sample_proportions:
-            results = ExperimentResults(repo_size)
-            label = get_label(cfg,proportion)
-            user_dir = ("results/roc-suite/%s" % user.user_id[:8])
-            if not os.path.exists(user_dir):
-                os.mkdir(user_dir)
-            log_file = os.path.join(user_dir,label["values"])
-            for n in range(iterations):
-                # Fill sample profile
-                profile_len = len(user.pkg_profile)
-                item_score = {}
-                for pkg in user.pkg_profile:
-                    item_score[pkg] = user.item_score[pkg]
-                sample = {}
-                sample_size = int(profile_len*proportion)
-                for i in range(sample_size):
-                     key = random.choice(item_score.keys())
-                     sample[key] = item_score.pop(key)
-                iteration_user = User(item_score)
-                recommendation = rec.get_recommendation(iteration_user,repo_size)
-                write_recall_log(label,n,sample,recommendation,profile_len,repo_size,log_file)
-                if hasattr(recommendation,"ranking"):
-                    results.add_result(recommendation.ranking,sample)
-            with open(log_file,'w') as f:
-                roc_points = results.get_roc_points()
-                x_coord = [p[0] for p in roc_points]
-                y_coord = [p[1] for p in roc_points]
-                auc = numpy.trapz(y=y_coord, x=x_coord)
-                eauc = (auc+
-                        numpy.trapz(y=[0,roc_points[0][1]],x=[0,roc_points[0][0]])+
-                        numpy.trapz(y=[roc_points[-1][1],1],x=[roc_points[-1][0],1]))
-                precision_20 = sum(results.precision[10])/len(results.precision[10])
-                coverage = len(results.recommended)/float(repo_size)
-                f.write("# %s\n# %s\n\n" %
-                        (label["description"],label["values"]))
-                f.write("# coverage \tp(20) \tauc \teauc\n\t%.2f \t%.2f \t%.4f \t%.4f\n\n" %
-                        (coverage,precision_20,auc,eauc))
-            plot_roc(roc_points,eauc,coverage,precision_20,log_file)
-
-def run_content(user,cfg):
-    for strategy in content_based:
-        cfg.strategy = strategy
-        for size in profile_size:
-            cfg.profile_size = size
-            run_strategy(cfg,user)
-
-def run_collaborative(user,cfg):
-    popcon_desktopapps = cfg.popcon_desktopapps
-    popcon_programs = cfg.popcon_programs
-    for strategy in collaborative:
-        cfg.strategy = strategy
-        for k in neighbors:
-            cfg.k_neighbors = k
-            run_strategy(cfg,user)
-
-def run_hybrid(user,cfg):
-    popcon_desktopapps = cfg.popcon_desktopapps
-    popcon_programs = cfg.popcon_programs
-    for strategy in hybrid:
-        cfg.strategy = strategy
-        for k in neighbors:
-            cfg.k_neighbors = k
-            for size in profile_size:
-                cfg.profile_size = size
-                run_strategy(cfg,user)
-
-if __name__ == '__main__':
-    if len(sys.argv)<2:
-        print "Usage: roc-suite popcon_submission_path [content|collaborative|hybrid]"
-        exit(1)
-
-    cfg = Config()
-    user = PopconSystem(sys.argv[1])
-    user.filter_pkg_profile(cfg.pkgs_filter)
-    user.maximal_pkg_profile()
-
-    if "content" in sys.argv or len(sys.argv)<3:
-        run_content(user,cfg)
-    if "collaborative" in sys.argv or len(sys.argv)<3:
-        run_collaborative(user,cfg)
-    if "hybrid" in sys.argv or len(sys.argv)<3:
-        run_hybrid(user,cfg)
@@ -1,274 +0,0 @@
-#!/usr/bin/env python
-"""
-    recommender suite - recommender experiments suite 
-"""
-__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
-__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
-__license__ = """
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-"""
-
-import sys
-sys.path.insert(0,'../')
-from config import Config
-from data import PopconXapianIndex, PopconSubmission, AppAptXapianIndex
-from recommender import Recommender
-from user import LocalSystem, User
-from evaluation import *
-import logging
-import random
-import Gnuplot
-
-#iterations = 3
-#sample_proportions = [0.9]
-#weighting = [('bm25',1.2)]
-#collaborative = ['knn']
-#content_based = []
-#hybrid = ['knnco']
-#profile_size = [50,100]
-#popcon_size = ["1000"]
-#neighbors = [50]
-
-iterations = 10
-sample_proportions = [0.5, 0.6, 0.7, 0.8, 0.9]
-weighting = [('bm25',1.2), ('bm25',1.6), ('bm25',2.0), ('trad',0)]
-content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
-collaborative = ['knn_eset','knn','knn_plus']
-hybrid = ['knnco','knnco_eset']
-
-profile_size = range(20,100,20)
-#popcon_size = [1000,10000,50000,'full']
-neighbors = range(10,510,50)
-
-def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
-    # Write recall log
-    output = open(("%s-%d" % (log_file,n)),'w')
-    output.write("# %s-n\n" % label["description"])
-    output.write("# %s-%d\n" % (label["values"],n))
-    output.write("\n%d %d %d\n" % \
-                 (repo_size,profile_size,len(sample)))
-    if hasattr(recommendation,"ranking"):
-        notfound = []
-        ranks = []
-        for pkg in sample.keys():
-            if pkg in recommendation.ranking:
-                ranks.append(recommendation.ranking.index(pkg))
-            else:
-                notfound.append(pkg)
-        for r in sorted(ranks):
-            output.write(str(r)+"\n")
-        if notfound:
-            output.write("Out of recommendation:\n")
-            for pkg in notfound:
-                output.write(pkg+"\n")
-    output.close()
-
-def plot_summary(precision,recall,f1,f05,accuracy,log_file):
-    # Plot metrics summary
-    g = Gnuplot.Gnuplot()
-    g('set style data lines')
-    g.xlabel('Recommendation size')
-    g.title("Setup: %s" % log_file.split("/")[-1])
-    g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
-           Gnuplot.Data(precision,title="Precision"),
-           Gnuplot.Data(recall,title="Recall"),
-           Gnuplot.Data(f1,title="F_1"),
-           Gnuplot.Data(f05,title="F_0.5"))
-    g.hardcopy(log_file+".png",terminal="png")
-    g.hardcopy(log_file+".ps",terminal="postscript",enhanced=1,color=1)
-    g('set logscale x')
-    g('replot')
-    g.hardcopy(log_file+"-logscale.png",terminal="png")
-    g.hardcopy(log_file+"-logscale.ps",terminal="postscript",enhanced=1,color=1)
-
-def get_label(cfg,sample_proportion):
-    label = {}
-    if cfg.strategy in content_based:
-        label["description"] = "strategy-filter-profile-k1_bm25-sample"
-        label["values"] = ("%s-profile%d-%s-kbm%.1f-sample%.1f" %
-                           (cfg.strategy,cfg.profile_size,
-                            cfg.pkgs_filter.split("/")[-1],
-                            cfg.bm25_k1,sample_proportion))
-    elif cfg.strategy in collaborative:
-       label["description"] = "strategy-knn-filter-k1_bm25-sample"
-       label["values"] = ("%s-k%d-%s-kbm%.1f-sample%.1f" %
-                          (cfg.strategy,cfg.k_neighbors,
-                           cfg.pkgs_filter.split("/")[-1],
-                           cfg.bm25_k1,sample_proportion))
-    elif cfg.strategy in hybrid:
-       label["description"] = "strategy-knn-filter-profile-k1_bm25-sample"
-       label["values"] = ("%s-k%d-profile%d-%s-kbm%.1f-sample%.1f" %
-                          (cfg.strategy,cfg.k_neighbors,cfg.profile_size,
-                           cfg.pkgs_filter.split("/")[-1],
-                           cfg.bm25_k1,sample_proportion))
-    else:
-        print "Unknown strategy"
-    return label
-
-class ExperimentResults:
-    def __init__(self,repo_size):
-        self.repository_size = repo_size
-        self.accuracy = {}
-        self.precision = {}
-        self.recall = {}
-        self.f1 = {}
-        self.f05 = {}
-        points = [1]+range(10,200,10)+range(200,self.repository_size,100)
-        for size in points:
-            self.accuracy[size] = []
-            self.precision[size] = []
-            self.recall[size] = []
-            self.f1[size] = []
-            self.f05[size] = []
-
-    def add_result(self,ranking,sample):
-        for size in self.accuracy.keys():
-            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
-            real = RecommendationResult(sample)
-            evaluation = Evaluation(predicted,real,self.repository_size)
-            self.accuracy[size].append(evaluation.run(Accuracy()))
-            self.precision[size].append(evaluation.run(Precision()))
-            self.recall[size].append(evaluation.run(Recall()))
-            self.f1[size].append(evaluation.run(F_score(1)))
-            self.f05[size].append(evaluation.run(F_score(0.5)))
-
-    def get_precision_summary(self):
-        summary = [[size,sum(values)/len(values)] for size,values in self.precision.items()]
-        return sorted(summary)
-
-    def get_recall_summary(self):
-        summary = [[size,sum(values)/len(values)] for size,values in self.recall.items()]
-        return sorted(summary)
-
-    def get_f1_summary(self):
-        summary = [[size,sum(values)/len(values)] for size,values in self.f1.items()]
-        return sorted(summary)
-
-    def get_f05_summary(self):
-        summary = [[size,sum(values)/len(values)] for size,values in self.f05.items()]
-        return sorted(summary)
-
-    def get_accuracy_summary(self):
-        summary = [[size,sum(values)/len(values)] for size,values in self.accuracy.items()]
-        return sorted(summary)
-
-    def best_precision(self):
-        size = max(self.precision, key = lambda x: max(self.precision[x]))
-        return (size,max(self.precision[size]))
-
-    def best_f1(self):
-        size = max(self.f1, key = lambda x: max(self.f1[x]))
-        return (size,max(self.f1[size]))
-
-    def best_f05(self):
-        size = max(self.f05, key = lambda x: max(self.f05[x]))
-        return (size,max(self.f05[size]))
-
-def run_strategy(cfg,user):
-    for weight in weighting:
-        cfg.weight = weight[0]
-        cfg.bm25_k1 = weight[1]
-        rec = Recommender(cfg)
-        repo_size = rec.items_repository.get_doccount()
-        for proportion in sample_proportions:
-            results = ExperimentResults(repo_size)
-            label = get_label(cfg,proportion)
-            log_file = "results/strategies/"+label["values"]
-            for n in range(iterations):
-                # Fill sample profile
-                profile_size = len(user.pkg_profile)
-                item_score = {}
-                for pkg in user.pkg_profile:
-                    item_score[pkg] = user.item_score[pkg]
-                sample = {}
-                sample_size = int(profile_size*proportion)
-                for i in range(sample_size):
-                     key = random.choice(item_score.keys())
-                     sample[key] = item_score.pop(key)
-                iteration_user = User(item_score)
-                recommendation = rec.get_recommendation(iteration_user,repo_size)
-                write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file)
-                if hasattr(recommendation,"ranking"):
-                    results.add_result(recommendation.ranking,sample)
-            with open(log_file,'w') as f:
-                precision_10 = sum(results.precision[10])/len(results.precision[10])
-                f1_10 = sum(results.f1[10])/len(results.f1[10])
-                f05_10 = sum(results.f05[10])/len(results.f05[10])
-                f.write("# %s\n# %s\n\ncoverage %d\n\n" %
-                        (label["description"],label["values"],recommendation.size))
-                f.write("# best results (recommendation size; metric)\n")
-                f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\nf05 (%d; %.2f)\n\n" %
-                        (results.best_precision()[0],results.best_precision()[1],
-                         results.best_f1()[0],results.best_f1()[1],
-                         results.best_f05()[0],results.best_f05()[1]))
-                f.write("# recommendation size 10\nprecision (10; %.2f)\nf1 (10; %.2f)\nf05 (10; %.2f)" %
-                        (precision_10,f1_10,f05_10))
-            precision = results.get_precision_summary()
-            recall = results.get_recall_summary()
-            f1 = results.get_f1_summary()
-            f05 = results.get_f05_summary()
-            accuracy = results.get_accuracy_summary()
-            plot_summary(precision,recall,f1,f05,accuracy,log_file)
-
-def run_content(user,cfg):
-    for strategy in content_based:
-        cfg.strategy = strategy
-        for size in profile_size:
-            cfg.profile_size = size
-            run_strategy(cfg,user)
-
-def run_collaborative(user,cfg):
-    popcon_desktopapps = cfg.popcon_desktopapps
-    popcon_programs = cfg.popcon_programs
-    for strategy in collaborative:
-        cfg.strategy = strategy
-        for k in neighbors:
-            cfg.k_neighbors = k
-            #for size in popcon_size:
-            #    if size:
-            #        cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
-            #        cfg.popcon_programs = popcon_programs+"_"+size
-            run_strategy(cfg,user)
-
-def run_hybrid(user,cfg):
-    popcon_desktopapps = cfg.popcon_desktopapps
-    popcon_programs = cfg.popcon_programs
-    for strategy in hybrid:
-        cfg.strategy = strategy
-        for k in neighbors:
-            cfg.k_neighbors = k
-            #for size in popcon_size:
-            #    if size:
-            #        cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
-            #        cfg.popcon_programs = popcon_programs+"_"+size
-            for size in profile_size:
-                cfg.profile_size = size
-                run_strategy(cfg,user)
-
-if __name__ == '__main__':
-    #user = LocalSystem()
-    #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
-
-    cfg = Config()
-    user = PopconSystem("/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7")
-    #user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
-    user.filter_pkg_profile(cfg.pkgs_filter)
-    user.maximal_pkg_profile()
-
-    if "content" in sys.argv or len(sys.argv)<2:
-        run_content(user,cfg)
-    if "collaborative" in sys.argv or len(sys.argv)<2:
-        run_collaborative(user,cfg)
-    if "hybrid" in sys.argv or len(sys.argv)<2:
-        run_hybrid(user,cfg)
...	...	@@ -1,186 +0,0 @@
1		-#!/usr/bin/env python
2		-"""
3		- k-suite - experiment different neighborhood sizes
4		-"""
5		-__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
6		-__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
7		-__license__ = """
8		- This program is free software: you can redistribute it and/or modify
9		- it under the terms of the GNU General Public License as published by
10		- the Free Software Foundation, either version 3 of the License, or
11		- (at your option) any later version.
12		-
13		- This program is distributed in the hope that it will be useful,
14		- but WITHOUT ANY WARRANTY; without even the implied warranty of
15		- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16		- GNU General Public License for more details.
17		-
18		- You should have received a copy of the GNU General Public License
19		- along with this program. If not, see <http://www.gnu.org/licenses/>.
20		-"""
21		-
22		-import sys
23		-sys.path.insert(0,'../')
24		-from config import Config
25		-from data import PopconXapianIndex, PopconSubmission
26		-from recommender import Recommender
27		-from user import LocalSystem, User
28		-from evaluation import *
29		-import logging
30		-import random
31		-import Gnuplot
32		-import numpy
33		-
34		-def plot_roc(k,roc_points,log_file):
35		- g = Gnuplot.Gnuplot()
36		- g('set style data points')
37		- g.xlabel('False Positive Rate')
38		- g.ylabel('True Positive Rate')
39		- g('set xrange [0:1.0]')
40		- g('set yrange [0:1.0]')
41		- g.title("Setup: %s-k%d" % (log_file.split("/")[-1],k))
42		- g.plot(Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"),
43		- Gnuplot.Data(roc_points))
44		- g.hardcopy(log_file+("-k%.3d.png"%k),terminal="png")
45		- g.hardcopy(log_file+("-k%.3d.ps"%k),terminal="postscript",enhanced=1,color=1)
46		-
47		-def plot_summary(precision,f05,mcc,log_file):
48		- g = Gnuplot.Gnuplot()
49		- g('set style data lines')
50		- g.xlabel('Neighborhood (k)')
51		- g.title("Setup: %s-size20" % (log_file.split("/")[-1]))
52		- g.plot(Gnuplot.Data([[k,sum(precision[k])/len(precision[k])] for k in precision.keys()],title="P"),
53		- Gnuplot.Data([[k,sum(f05[k])/len(f05[k])] for k in f05.keys()],title="F05"),
54		- Gnuplot.Data([[k,sum(mcc[k])/len(mcc[k])] for k in mcc.keys()],title="MCC"))
55		- g.hardcopy(log_file+(".png"),terminal="png")
56		- g.hardcopy(log_file+(".ps"),terminal="postscript",enhanced=1,color=1)
57		-
class ExperimentResults:
    """Collects evaluation metrics over repeated runs and averages them.

    One value per metric (precision, recall, false positive rate, F0.5,
    MCC) is stored for every add_result() call.
    """

    def __init__(self,repo_size):
        self.repository_size = repo_size
        self.precision, self.recall, self.fpr = [], [], []
        self.f05, self.mcc = [], []

    def _average(self,values):
        # Mean of the collected values; 0 when nothing has been collected.
        if not values:
            return 0
        return sum(values)/len(values)

    def add_result(self,ranking,sample):
        """Evaluate one ranking against the held-out sample and store the metrics."""
        predicted = RecommendationResult(dict.fromkeys(ranking,1))
        real = RecommendationResult(sample)
        run = Evaluation(predicted,real,self.repository_size).run
        self.precision.append(run(Precision()))
        self.recall.append(run(Recall()))
        self.fpr.append(run(FPR()))
        self.f05.append(run(F_score(0.5)))
        self.mcc.append(run(MCC()))

    def get_roc_point(self):
        """Return [mean fpr, mean recall], or [0,0] when either list is empty."""
        if self.recall and self.fpr:
            return [self._average(self.fpr),self._average(self.recall)]
        return [0,0]

    def get_precision_summary(self):
        """Return the mean precision (0 when no result was added)."""
        return self._average(self.precision)

    def get_f05_summary(self):
        """Return the mean F0.5 score (0 when no result was added)."""
        return self._average(self.f05)

    def get_mcc_summary(self):
        """Return the mean MCC (0 when no result was added)."""
        return self._average(self.mcc)
95		-
if __name__ == '__main__':
    # k-suite driver: for every user id listed in the sample file and every
    # neighborhood size k, repeatedly hide 90% of the user's profile, ask the
    # recommender for `threshold` items from the remaining 10%, and score
    # the ranking against the hidden part. Logs and plots are written under
    # results/k-suite/<sample file name>/.
    import os  # used below; not imported explicitly at the top of the file

    if len(sys.argv)<3:
        print("Usage: k-suite strategy_str sample_file")
        sys.exit(1)
    threshold = 20  # recommendation size requested from the recommender
    iterations = 30
    neighbors = [3,5,10,50,100,150,200,300,400,500]
    cfg = Config()
    cfg.strategy = sys.argv[1]
    sample_file = sys.argv[2]

    def _mean(values):
        # Mean as a float; 0.0 for an empty list instead of ZeroDivisionError.
        if not values:
            return 0.0
        return float(sum(values))/len(values)

    population_sample = []
    with open(sample_file,'r') as f:
        for line in f:
            user_id = line.strip()
            if not user_id:
                continue  # a blank line would otherwise become a bogus path
            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
    if not population_sample:
        # Without users the summary below has no repository size to report.
        print("No user ids found in %s" % sample_file)
        sys.exit(1)

    # setup dictionaries and files
    roc_summary = {}
    recommended = {}
    precision_summary = {}
    f05_summary = {}
    mcc_summary = {}
    sample_dir = ("results/k-suite/%s" % sample_file.split('/')[-1])
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)
    log_file = os.path.join(sample_dir,cfg.strategy)
    with open(log_file,'w') as f:
        f.write("# %s\n\n" % sample_file.split('/')[-1])
        f.write("# strategy %s recommendation_size %d iterations %d\n\n" %
                (cfg.strategy,threshold,iterations))
        f.write("# k coverage \tprecision \tf05 \tmcc\n\n")

    for k in neighbors:
        roc_summary[k] = []
        recommended[k] = set()
        precision_summary[k] = []
        f05_summary[k] = []
        mcc_summary[k] = []
        with open(log_file+"-k%.3d"%k,'w') as f:
            f.write("# %s\n\n" % sample_file.split('/')[-1])
            f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))
            f.write("# roc_point \tprecision \tf05 \tmcc\n\n")

    # main loop per user
    for submission_file in population_sample:
        # NOTE(review): PopconSystem is not among the names imported
        # explicitly at the top of the file - confirm where it comes from.
        user = PopconSystem(submission_file)
        user.filter_pkg_profile(cfg.pkgs_filter)
        user.maximal_pkg_profile()
        for k in neighbors:
            cfg.k_neighbors = k
            rec = Recommender(cfg)
            repo_size = rec.items_repository.get_doccount()
            results = ExperimentResults(repo_size)
            # n iterations for same recommender and user
            for n in range(iterations):
                # Move a random 90% of the profile into `sample`; what is
                # left in item_score is the profile the recommender sees.
                item_score = dict((pkg,user.item_score[pkg])
                                  for pkg in user.pkg_profile)
                sample = {}
                sample_size = int(len(user.pkg_profile)*0.9)
                for i in range(sample_size):
                    key = random.choice(list(item_score))
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,threshold)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
                    recommended[k].update(recommendation.ranking)
            # save summary
            roc_point = results.get_roc_point()
            roc_summary[k].append(roc_point)
            precision = results.get_precision_summary()
            precision_summary[k].append(precision)
            f05 = results.get_f05_summary()
            f05_summary[k].append(f05)
            mcc = results.get_mcc_summary()
            mcc_summary[k].append(mcc)
            with open(log_file+"-k%.3d"%k,'a') as f:
                f.write("[%.2f,%.2f] \t%.4f \t%.4f \t%.4f\n" %
                        (roc_point[0],roc_point[1],precision,f05,mcc))

    # back to main flow
    with open(log_file,'a') as f:
        plot_summary(precision_summary,f05_summary,mcc_summary,log_file)
        for k in neighbors:
            # Coverage: distinct recommended items over the repository size.
            # The original indexed recommended[] with an undefined name.
            coverage = len(recommended[k])/float(repo_size)
            f.write("%3d \t%.2f \t%.4f \t%.4f \t%.4f\n" %
                    (k,coverage,_mean(precision_summary[k]),
                     _mean(f05_summary[k]),
                     _mean(mcc_summary[k])))
            plot_roc(k,roc_summary[k],log_file)
...	...	@@ -1,231 +0,0 @@
1		-#!/usr/bin/env python
2		-"""
3		- recommender suite - recommender experiments suite
4		-"""
5		-__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
6		-__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
7		-__license__ = """
8		- This program is free software: you can redistribute it and/or modify
9		- it under the terms of the GNU General Public License as published by
10		- the Free Software Foundation, either version 3 of the License, or
11		- (at your option) any later version.
12		-
13		- This program is distributed in the hope that it will be useful,
14		- but WITHOUT ANY WARRANTY; without even the implied warranty of
15		- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16		- GNU General Public License for more details.
17		-
18		- You should have received a copy of the GNU General Public License
19		- along with this program. If not, see <http://www.gnu.org/licenses/>.
20		-"""
21		-
22		-import sys
23		-sys.path.insert(0,'../')
24		-from config import Config
25		-from data import PopconXapianIndex, PopconSubmission
26		-from recommender import Recommender
27		-from user import LocalSystem, User
28		-from evaluation import *
29		-import logging
30		-import random
31		-import Gnuplot
32		-import numpy
33		-
# Experiment grid for the full run. For a quick smoke test shrink it, e.g.
# iterations = 3, a single strategy per family, profile_size = [50,100] and
# neighbors = [50].

# Repetitions per configuration.
iterations = 30
# Fraction of the user profile held out as the evaluation sample.
sample_proportions = [0.9]
# (weighting scheme, bm25 k1) pairs.
weighting = [('bm25',1.0)]

# Strategy names grouped by family.
content_based = [
    'cb','cbt','cbd','cbh',
    'cb_eset','cbt_eset','cbd_eset','cbh_eset',
]
collaborative = ['knn_eset','knn','knn_plus']
hybrid = ['knnco','knnco_eset']

# Values tried for cfg.profile_size and cfg.k_neighbors.
profile_size = range(20,200,40)
neighbors = range(10,510,50)
52		-
def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
    """Write the recall log of iteration n to '<log_file>-NN'.

    The file starts with the label description and values followed by the
    repository size, profile size and sample size. When `recommendation`
    has a `ranking`, the (0-based) rank of every sampled package found in it
    is listed in ascending order, followed by the packages that were not
    recommended at all.
    """
    # 'with' closes the file even if a write fails half way through.
    with open(("%s-%.2d" % (log_file,n)),'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%.2d\n" % (label["values"],n))
        output.write("\n# repository profile sample\n%d %d %d\n" % \
                     (repo_size,profile_size,len(sample)))
        if hasattr(recommendation,"ranking"):
            # Index the ranking once; 'in' plus '.index' on the list for
            # every sampled package rescans the whole ranking each time.
            # setdefault keeps the first position, like list.index does.
            first_rank = {}
            for position,pkg in enumerate(recommendation.ranking):
                first_rank.setdefault(pkg,position)
            notfound = []
            ranks = []
            for pkg in sample:
                if pkg in first_rank:
                    ranks.append(first_rank[pkg])
                else:
                    notfound.append(pkg)
            for r in sorted(ranks):
                output.write(str(r)+"\n")
            if notfound:
                output.write("# out of recommendation:\n")
                for pkg in notfound:
                    output.write(pkg+"\n")
75		-
def plot_roc(roc_points,eauc,c,p,log_file):
    """Draw a ROC curve annotated with coverage, P(20) and AUC.

    c, p and eauc are only printed as labels on the chart. The curve is
    drawn together with the (0,0)-(1,1) diagonal and saved as
    '<log_file>-roc.png' and '<log_file>-roc.ps'.
    """
    chart = Gnuplot.Gnuplot()
    chart('set style data lines')
    chart.xlabel('False Positive Rate')
    chart.ylabel('True Positive Rate')
    chart('set xrange [0:1.0]')
    chart('set yrange [0:1.0]')
    chart.title("Setup: %s" % log_file.split("/")[-1])
    # The three figures are stacked in the lower right corner of the chart.
    annotations = ('set label "C %.2f" at 0.8,0.25' % c,
                   'set label "P(20) %.2f" at 0.8,0.2' % p,
                   'set label "AUC %.4f" at 0.8,0.15' % eauc)
    for command in annotations:
        chart(command)
    curve = Gnuplot.Data(roc_points,title="ROC")
    diagonal = Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7")
    chart.plot(curve,diagonal)
    chart.hardcopy(log_file+"-roc.png",terminal="png")
    chart.hardcopy(log_file+"-roc.ps",terminal="postscript",enhanced=1,color=1)
92		-
def get_label(cfg,sample_proportion):
    """Build the description/values label identifying the current setup.

    The label depends on which strategy family cfg.strategy belongs to
    (content_based, collaborative or hybrid). For any other strategy a
    message is printed and an empty dict is returned. sample_proportion is
    accepted but not used.
    """
    strategy = cfg.strategy
    if strategy in content_based:
        return {"description": "strategy-profile",
                "values": "%s-profile%.3d" % (strategy,cfg.profile_size)}
    if strategy in collaborative:
        return {"description": "strategy-knn",
                "values": "%s-k%.3d" % (strategy,cfg.k_neighbors)}
    if strategy in hybrid:
        return {"description": "strategy-knn-profile",
                "values": "%s-k%.3d-profile%.3d" %
                          (strategy,cfg.k_neighbors,cfg.profile_size)}
    print("Unknown strategy")
    return {}
110		-
class ExperimentResults:
    """Collects precision, recall and false positive rate per recommendation size.

    Metrics are kept for the sizes 1, 10, 20, ... below the repository size;
    each dict maps a size to the list of values gathered so far.
    `recommended` is the set of all items that appeared in any ranking.
    """

    def __init__(self,repo_size):
        self.repository_size = repo_size
        self.precision = {}
        self.recall = {}
        self.fpr = {}
        self.recommended = set()
        # list() keeps the concatenation valid where range() is not a list.
        points = [1]+list(range(10,self.repository_size,10))
        for size in points:
            self.precision[size] = []
            self.recall[size] = []
            self.fpr[size] = []

    def add_result(self,ranking,sample):
        """Evaluate every prefix ranking[:size] against the held-out sample."""
        self.recommended.update(ranking)
        # get data only for point
        for size in self.precision:
            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repository_size)
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            self.fpr[size].append(evaluation.run(FPR()))

    # Average ROC by threshold (= size of recommendation)
    def get_roc_points(self):
        """Return the sorted list of [mean fpr, mean recall], one per size.

        Sizes without any recorded value are left out, so the result is an
        empty list (not a ZeroDivisionError) when add_result() was never
        called.
        """
        points = []
        for size in self.recall:
            tpr = self.recall[size]
            fpr = self.fpr[size]
            if not tpr or not fpr:
                continue
            points.append([sum(fpr)/float(len(fpr)),sum(tpr)/float(len(tpr))])
        return sorted(points)
143		-
def run_strategy(cfg,user):
    """Run the configured strategy for `user` and write its ROC log and plot.

    For every weighting scheme and sample proportion, a random part of the
    user's profile is held out `iterations` times; the recommender is asked
    for a repository-sized ranking from the rest, and the ranking is scored
    against the held-out part. Results go to
    'results/roc-suite/<first 8 chars of user id>/<label values>'.
    """
    import os  # not imported explicitly at the top of the file

    for weight in weighting:
        cfg.weight = weight[0]
        cfg.bm25_k1 = weight[1]
        rec = Recommender(cfg)
        repo_size = rec.items_repository.get_doccount()
        for proportion in sample_proportions:
            results = ExperimentResults(repo_size)
            label = get_label(cfg,proportion)
            user_dir = ("results/roc-suite/%s" % user.user_id[:8])
            if not os.path.isdir(user_dir):
                # makedirs also creates 'results/roc-suite' on a first run,
                # where a plain mkdir would fail.
                os.makedirs(user_dir)
            log_file = os.path.join(user_dir,label["values"])
            profile_len = len(user.pkg_profile)
            sample_size = int(profile_len*proportion)
            recorded = 0
            for n in range(iterations):
                # Move a random part of the profile into `sample`; what is
                # left in item_score is the profile the recommender sees.
                item_score = dict((pkg,user.item_score[pkg])
                                  for pkg in user.pkg_profile)
                sample = {}
                for i in range(sample_size):
                    key = random.choice(list(item_score))
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,repo_size)
                write_recall_log(label,n,sample,recommendation,profile_len,repo_size,log_file)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
                    recorded += 1
            if not recorded:
                # Averaging below would divide by zero without any result.
                print("No ranked recommendation for %s; summary skipped" %
                      label["values"])
                continue
            roc_points = results.get_roc_points()
            x_coord = [p[0] for p in roc_points]
            y_coord = [p[1] for p in roc_points]
            auc = numpy.trapz(y=y_coord, x=x_coord)
            # Extended AUC: also count the area from (0,0) to the first
            # point and from the last point to (1,1).
            eauc = (auc+
                    numpy.trapz(y=[0,roc_points[0][1]],x=[0,roc_points[0][0]])+
                    numpy.trapz(y=[roc_points[-1][1],1],x=[roc_points[-1][0],1]))
            # Reported as p(20) in the log and P(20) on the plot, so read
            # the size-20 bucket (the original read the size-10 one).
            precision_20 = sum(results.precision[20])/len(results.precision[20])
            coverage = len(results.recommended)/float(repo_size)
            with open(log_file,'w') as f:
                f.write("# %s\n# %s\n\n" %
                        (label["description"],label["values"]))
                f.write("# coverage \tp(20) \tauc \teauc\n\t%.2f \t%.2f \t%.4f \t%.4f\n\n" %
                        (coverage,precision_20,auc,eauc))
            plot_roc(roc_points,eauc,coverage,precision_20,log_file)
188		-
def run_content(user,cfg):
    """Run every content-based strategy once per configured profile size."""
    for strategy_name in content_based:
        cfg.strategy = strategy_name
        for terms in profile_size:
            cfg.profile_size = terms
            run_strategy(cfg,user)
195		-
def run_collaborative(user,cfg):
    """Run every collaborative strategy once per configured neighborhood size."""
    # The original also copied cfg.popcon_desktopapps and cfg.popcon_programs
    # into locals that were never used; they are dropped here.
    for strategy in collaborative:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            run_strategy(cfg,user)
204		-
def run_hybrid(user,cfg):
    """Run every hybrid strategy for each neighborhood size and profile size."""
    # The original also copied cfg.popcon_desktopapps and cfg.popcon_programs
    # into locals that were never used; they are dropped here.
    for strategy in hybrid:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            for size in profile_size:
                cfg.profile_size = size
                run_strategy(cfg,user)
215		-
if __name__ == '__main__':
    # roc-suite driver: evaluate one popcon submission with the strategy
    # families named on the command line (all three when none is named).
    if len(sys.argv)<2:
        # The original message printed literal backslashes before each '|'.
        print("Usage: roc-suite popcon_submission_path [content|collaborative|hybrid]")
        sys.exit(1)

    cfg = Config()
    # NOTE(review): PopconSystem is not among the names imported explicitly
    # at the top of the file - confirm where it comes from.
    user = PopconSystem(sys.argv[1])
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    # Only the arguments after the submission path select families, so a
    # path that happens to be named like a family is not mistaken for one.
    selected = sys.argv[2:]
    if "content" in selected or not selected:
        run_content(user,cfg)
    if "collaborative" in selected or not selected:
        run_collaborative(user,cfg)
    if "hybrid" in selected or not selected:
        run_hybrid(user,cfg)
...	...	@@ -1,274 +0,0 @@
1		-#!/usr/bin/env python
2		-"""
3		- recommender suite - recommender experiments suite
4		-"""
5		-__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
6		-__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
7		-__license__ = """
8		- This program is free software: you can redistribute it and/or modify
9		- it under the terms of the GNU General Public License as published by
10		- the Free Software Foundation, either version 3 of the License, or
11		- (at your option) any later version.
12		-
13		- This program is distributed in the hope that it will be useful,
14		- but WITHOUT ANY WARRANTY; without even the implied warranty of
15		- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16		- GNU General Public License for more details.
17		-
18		- You should have received a copy of the GNU General Public License
19		- along with this program. If not, see <http://www.gnu.org/licenses/>.
20		-"""
21		-
22		-import sys
23		-sys.path.insert(0,'../')
24		-from config import Config
25		-from data import PopconXapianIndex, PopconSubmission, AppAptXapianIndex
26		-from recommender import Recommender
27		-from user import LocalSystem, User
28		-from evaluation import *
29		-import logging
30		-import random
31		-import Gnuplot
32		-
# Experiment grid for the full run. For a quick smoke test shrink it, e.g.
# iterations = 3, sample_proportions = [0.9], a single strategy per family,
# profile_size = [50,100] and neighbors = [50].

# Repetitions per configuration.
iterations = 10
# Fractions of the user profile held out as the evaluation sample.
sample_proportions = [0.5, 0.6, 0.7, 0.8, 0.9]
# (weighting scheme, bm25 k1) pairs.
weighting = [
    ('bm25',1.2),
    ('bm25',1.6),
    ('bm25',2.0),
    ('trad',0),
]

# Strategy names grouped by family.
content_based = [
    'cb','cbt','cbd','cbh',
    'cb_eset','cbt_eset','cbd_eset','cbh_eset',
]
collaborative = ['knn_eset','knn','knn_plus']
hybrid = ['knnco','knnco_eset']

# Values tried for cfg.profile_size and cfg.k_neighbors.
profile_size = range(20,100,20)
neighbors = range(10,510,50)
53		-
def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
    """Write the recall log of iteration n to '<log_file>-<n>'.

    The file starts with the label description and values followed by the
    repository size, profile size and sample size. When `recommendation`
    has a `ranking`, the (0-based) rank of every sampled package found in it
    is listed in ascending order, followed by the packages that were not
    recommended at all.
    """
    # 'with' closes the file even if a write fails half way through.
    with open(("%s-%d" % (log_file,n)),'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%d\n" % (label["values"],n))
        output.write("\n%d %d %d\n" % \
                     (repo_size,profile_size,len(sample)))
        if hasattr(recommendation,"ranking"):
            # Index the ranking once; 'in' plus '.index' on the list for
            # every sampled package rescans the whole ranking each time.
            # setdefault keeps the first position, like list.index does.
            first_rank = {}
            for position,pkg in enumerate(recommendation.ranking):
                first_rank.setdefault(pkg,position)
            notfound = []
            ranks = []
            for pkg in sample:
                if pkg in first_rank:
                    ranks.append(first_rank[pkg])
                else:
                    notfound.append(pkg)
            for r in sorted(ranks):
                output.write(str(r)+"\n")
            if notfound:
                output.write("Out of recommendation:\n")
                for pkg in notfound:
                    output.write(pkg+"\n")
76		-
def plot_summary(precision,recall,f1,f05,accuracy,log_file):
    """Plot the metric summaries against the recommendation size.

    Each metric argument is handed to Gnuplot.Data unchanged. The chart is
    saved as '<log_file>.png' / '.ps' and, after switching the x axis to a
    logarithmic scale, again as '<log_file>-logscale.png' / '.ps'.
    """
    chart = Gnuplot.Gnuplot()
    chart('set style data lines')
    chart.xlabel('Recommendation size')
    chart.title("Setup: %s" % log_file.split("/")[-1])
    curves = [Gnuplot.Data(accuracy,title="Accuracy"),
              Gnuplot.Data(precision,title="Precision"),
              Gnuplot.Data(recall,title="Recall"),
              Gnuplot.Data(f1,title="F_1"),
              Gnuplot.Data(f05,title="F_0.5")]
    chart.plot(*curves)
    chart.hardcopy(log_file+".png",terminal="png")
    chart.hardcopy(log_file+".ps",terminal="postscript",enhanced=1,color=1)
    # Same curves once more on a logarithmic x axis.
    chart('set logscale x')
    chart('replot')
    chart.hardcopy(log_file+"-logscale.png",terminal="png")
    chart.hardcopy(log_file+"-logscale.ps",terminal="postscript",enhanced=1,color=1)
94		-
def get_label(cfg,sample_proportion):
    """Build the description/values label identifying the current setup.

    The label depends on which strategy family cfg.strategy belongs to
    (content_based, collaborative or hybrid) and always ends with the
    package filter name, the bm25 k1 value and the sample proportion. For
    any other strategy a message is printed and an empty dict is returned.
    """
    strategy = cfg.strategy
    if strategy in content_based:
        description = "strategy-filter-profile-k1_bm25-sample"
        values = ("%s-profile%d-%s-kbm%.1f-sample%.1f" %
                  (strategy,cfg.profile_size,
                   cfg.pkgs_filter.split("/")[-1],
                   cfg.bm25_k1,sample_proportion))
    elif strategy in collaborative:
        description = "strategy-knn-filter-k1_bm25-sample"
        values = ("%s-k%d-%s-kbm%.1f-sample%.1f" %
                  (strategy,cfg.k_neighbors,
                   cfg.pkgs_filter.split("/")[-1],
                   cfg.bm25_k1,sample_proportion))
    elif strategy in hybrid:
        description = "strategy-knn-filter-profile-k1_bm25-sample"
        values = ("%s-k%d-profile%d-%s-kbm%.1f-sample%.1f" %
                  (strategy,cfg.k_neighbors,cfg.profile_size,
                   cfg.pkgs_filter.split("/")[-1],
                   cfg.bm25_k1,sample_proportion))
    else:
        print("Unknown strategy")
        return {}
    return {"description": description, "values": values}
118		-
class ExperimentResults:
    """Collects accuracy, precision, recall, F1 and F0.5 per recommendation size.

    Metrics are kept for the sizes 1, 10, 20, ..., 190 and then every 100
    from 200 up to the repository size; each dict maps a size to the list
    of values gathered so far.
    """

    def __init__(self,repo_size):
        self.repository_size = repo_size
        self.accuracy = {}
        self.precision = {}
        self.recall = {}
        self.f1 = {}
        self.f05 = {}
        # list() keeps the concatenation valid where range() is not a list.
        points = ([1]+list(range(10,200,10))+
                  list(range(200,self.repository_size,100)))
        for size in points:
            self.accuracy[size] = []
            self.precision[size] = []
            self.recall[size] = []
            self.f1[size] = []
            self.f05[size] = []

    def add_result(self,ranking,sample):
        """Evaluate every prefix ranking[:size] against the held-out sample."""
        for size in self.accuracy:
            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repository_size)
            self.accuracy[size].append(evaluation.run(Accuracy()))
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            self.f1[size].append(evaluation.run(F_score(1)))
            self.f05[size].append(evaluation.run(F_score(0.5)))

    def _summary(self,metric):
        # Sorted [size, mean] pairs; sizes without values are left out so an
        # empty experiment yields [] instead of a ZeroDivisionError.
        return sorted([size,sum(values)/float(len(values))]
                      for size,values in metric.items() if values)

    def _best(self,metric):
        # (size, value) of the highest single value. Sizes are visited in
        # ascending order and only a strictly larger value replaces the
        # current best, so ties go to the smallest size rather than to
        # whatever order the dict happens to iterate in.
        best_size = None
        best_value = None
        for size in sorted(metric):
            values = metric[size]
            if not values:
                continue
            top = max(values)
            if best_value is None or top > best_value:
                best_size, best_value = size, top
        if best_size is None:
            raise ValueError("no results recorded")
        return (best_size,best_value)

    def get_precision_summary(self):
        """Return sorted [size, mean precision] pairs."""
        return self._summary(self.precision)

    def get_recall_summary(self):
        """Return sorted [size, mean recall] pairs."""
        return self._summary(self.recall)

    def get_f1_summary(self):
        """Return sorted [size, mean F1] pairs."""
        return self._summary(self.f1)

    def get_f05_summary(self):
        """Return sorted [size, mean F0.5] pairs."""
        return self._summary(self.f05)

    def get_accuracy_summary(self):
        """Return sorted [size, mean accuracy] pairs."""
        return self._summary(self.accuracy)

    def best_precision(self):
        """Return (size, value) of the best precision; ValueError if none."""
        return self._best(self.precision)

    def best_f1(self):
        """Return (size, value) of the best F1; ValueError if none."""
        return self._best(self.f1)

    def best_f05(self):
        """Return (size, value) of the best F0.5; ValueError if none."""
        return self._best(self.f05)
177		-
def run_strategy(cfg,user):
    """Run the configured strategy for `user` and write its summary and plots.

    For every weighting scheme and sample proportion, a random part of the
    user's profile is held out `iterations` times; the recommender is asked
    for a repository-sized ranking from the rest, and the ranking is scored
    against the held-out part. Results go to
    'results/strategies/<label values>'.
    """
    import os  # not imported explicitly at the top of the file

    for weight in weighting:
        cfg.weight = weight[0]
        cfg.bm25_k1 = weight[1]
        rec = Recommender(cfg)
        repo_size = rec.items_repository.get_doccount()
        for proportion in sample_proportions:
            results = ExperimentResults(repo_size)
            label = get_label(cfg,proportion)
            log_file = "results/strategies/"+label["values"]
            # The log files cannot be opened if the directory is missing.
            log_dir = os.path.dirname(log_file)
            if not os.path.isdir(log_dir):
                os.makedirs(log_dir)
            # Named profile_len so the module-level profile_size list is
            # not shadowed inside this function.
            profile_len = len(user.pkg_profile)
            sample_size = int(profile_len*proportion)
            recorded = 0
            recommendation = None
            for n in range(iterations):
                # Move a random part of the profile into `sample`; what is
                # left in item_score is the profile the recommender sees.
                item_score = dict((pkg,user.item_score[pkg])
                                  for pkg in user.pkg_profile)
                sample = {}
                for i in range(sample_size):
                    key = random.choice(list(item_score))
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,repo_size)
                write_recall_log(label,n,sample,recommendation,profile_len,repo_size,log_file)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
                    recorded += 1
            if not recorded:
                # Every average below would divide by zero without results.
                print("No ranked recommendation for %s; summary skipped" %
                      label["values"])
                continue
            precision_10 = sum(results.precision[10])/len(results.precision[10])
            f1_10 = sum(results.f1[10])/len(results.f1[10])
            f05_10 = sum(results.f05[10])/len(results.f05[10])
            best_precision = results.best_precision()
            best_f1 = results.best_f1()
            best_f05 = results.best_f05()
            # "coverage" is the size reported by the last recommendation.
            # NOTE(review): falls back to 0 when that object has no `size`
            # attribute - confirm the attribute is always present.
            coverage = getattr(recommendation,"size",0)
            with open(log_file,'w') as f:
                f.write("# %s\n# %s\n\ncoverage %d\n\n" %
                        (label["description"],label["values"],coverage))
                f.write("# best results (recommendation size; metric)\n")
                f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\nf05 (%d; %.2f)\n\n" %
                        (best_precision[0],best_precision[1],
                         best_f1[0],best_f1[1],
                         best_f05[0],best_f05[1]))
                f.write("# recommendation size 10\nprecision (10; %.2f)\nf1 (10; %.2f)\nf05 (10; %.2f)" %
                        (precision_10,f1_10,f05_10))
            precision = results.get_precision_summary()
            recall = results.get_recall_summary()
            f1 = results.get_f1_summary()
            f05 = results.get_f05_summary()
            accuracy = results.get_accuracy_summary()
            plot_summary(precision,recall,f1,f05,accuracy,log_file)
223		-
def run_content(user,cfg):
    """Run every content-based strategy once per configured profile size."""
    for strategy_name in content_based:
        cfg.strategy = strategy_name
        for terms in profile_size:
            cfg.profile_size = terms
            run_strategy(cfg,user)
230		-
def run_collaborative(user,cfg):
    """Run every collaborative strategy once per configured neighborhood size."""
    # The original copied cfg.popcon_desktopapps and cfg.popcon_programs into
    # locals used only by a commented-out loop over popcon index sizes;
    # both the unused locals and that disabled loop are dropped here.
    for strategy in collaborative:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            run_strategy(cfg,user)
243		-
def run_hybrid(user,cfg):
    """Run every hybrid strategy for each neighborhood size and profile size."""
    # The original copied cfg.popcon_desktopapps and cfg.popcon_programs into
    # locals used only by a commented-out loop over popcon index sizes;
    # both the unused locals and that disabled loop are dropped here.
    for strategy in hybrid:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            for size in profile_size:
                cfg.profile_size = size
                run_strategy(cfg,user)
258		-
if __name__ == '__main__':
    # strategies-suite driver: evaluates one fixed popcon submission with
    # the strategy families named on the command line, or with all of them
    # when no argument is given.
    cfg = Config()
    user = PopconSystem("/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7")
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    run_everything = len(sys.argv)<2
    for keyword,runner in (("content",run_content),
                           ("collaborative",run_collaborative),
                           ("hybrid",run_hybrid)):
        if run_everything or keyword in sys.argv:
            runner(user,cfg)