Merge branch 'master' of https://github.com/tassia/AppRecommender

Tássia Camões Araújo
2 parents ef8c9733 b33c0cb1
Showing 25 changed files with 1589 additions and 1230 deletions Show diff stats
README
src/bin/cross_validation.py
src/bin/get_axipkgs.py
src/bin/get_desktop.sh
src/bin/get_pkgs_inst.py
src/config.py
src/evaluation.py
src/experiments/README
src/experiments/deprecated/k-suite.py
src/experiments/deprecated/strategies-suite.py
src/experiments/experiments.cfg
src/experiments/extract-sample-db.py
src/experiments/hybrid.py
src/experiments/k-suite.py
src/experiments/legacy/clustering-suite.py
src/experiments/legacy/experiments.cfg
src/experiments/legacy/runner.py
src/experiments/pure.py
src/experiments/roc-sample.py
src/experiments/roc-single.py
@@ -6,10 +6,7 @@ Install dependencies
 # apt-get install \
 python python-xapian python-apt python-cluster python-webpy python-simplejson \
-python-unittest2 python-numpy python-gnuplot \
-apt-xapian-index gnuplot
-
-# cd ./src; git clone https://github.com/rueckstiess/expsuite
+python-numpy apt-xapian-index app-install-data python-xdg
 Run AppRecommender web UI
@@ -20,4 +17,5 @@ Run AppRecommender web UI
 Open a browser and access http://localhost:8080
+
 More info at https://github.com/tassia/AppRecommender/wiki
@@ -37,7 +37,7 @@ if __name__ == '__main__':
     #user = LocalSystem()
     #user = RandomPopcon(cfg.popcon_dir)
     #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
-    user = PopconSystem("/home/tassia/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
+    user = PopconSystem(os.path.expanduser("~/.app-recommender/popcon-entries/00/0001166d0737c6dffb083071e5ee69f5"))
     user.filter_pkg_profile(os.path.join(cfg.filters_dir,"desktopapps"))
     user.maximal_pkg_profile()
     begin_time = datetime.datetime.now()
@@ -48,7 +48,7 @@ if __name__ == '__main__':
     metrics.append(F_score(0.5))
     metrics.append(Accuracy())
     metrics.append(FPR())
-    validation = CrossValidation(0.9,10,rec,metrics,1)
+    validation = CrossValidation(0.9,20,rec,metrics,0.005)
     validation.run(user)
     print validation
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+"""
+    AppRecommender - A GNU/Linux application recommender
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import os
+import sys
+sys.path.insert(0,'../')
+import xapian
+
+if __name__ == '__main__':
+    # Print, for every document of a xapian index, its first "XP"-prefixed
+    # term with the prefix removed (presumably the package name -- confirm
+    # against the apt-xapian-index term prefixes).
+    #
+    # Usage: get_axipkgs index_path
+    if len(sys.argv)<2:
+        print "Usage: get_axipkgs index_path"
+        exit(1)
+
+    axi_path = sys.argv[1]
+    axi = xapian.Database(axi_path)
+    prefix = "XP"
+    # get_lastdocid() is itself a valid document id, so the upper bound of
+    # the range must be inclusive or the last document is never visited.
+    for n in range(1,axi.get_lastdocid()+1):
+        try:
+            doc = axi.get_document(n)
+        except xapian.DocNotFoundError:
+            # Document ids may be sparse; skip the unused ones only, and
+            # let any other database error surface.
+            continue
+        xp_terms = [t.term for t in doc.termlist() if t.term.startswith(prefix)]
+        # Documents without an XP term are skipped instead of raising
+        # IndexError.
+        if xp_terms:
+            # Slice the prefix off: lstrip('XP') would strip every leading
+            # 'X' or 'P' character of the name, not only the prefix.
+            print xp_terms[0][len(prefix):]
 #!/usr/bin/env bash
 #
-# get_desktop.sh - get packages which have desktop files 
+# get_desktop.sh - get packages which have desktop files
+#
+# DEPRECATED: use get_axipkgs.py to get this info from axi
 cd /usr/share/app-install/desktop
 sed -ne 's/X-AppInstall-Package=//p' * | sort -u | grep -v kdelibs | grep -v libfm-gtk0
 #!/usr/bin/env python
 #
 # get_pkgs_inst.py - get tuple (package,installation) from popcon results file
+#
+# results_file: org/popcon.debian.org/popcon-mail/results
+import sys
 from operator import itemgetter
+
 if __name__ == '__main__':
+    if len(sys.argv)<2:
+        print "Usage: get_pkgs_inst popcon_results_path"
+        exit(1)
+
+    results_path = sys.argv[1]
     pkgs_inst = {}
-    with open("/root/org/popcon.debian.org/popcon-mail/results") as results:
+    with open(results_path) as results:
         for line in results:
             if line.startswith("Package"):
                 fields = line.split()
                 inst = int(fields[2])+int(fields[3])+int(fields[4])
-                if inst > 20:
-                    pkgs_inst[fields[1]] = inst
+                pkgs_inst[fields[1]] = inst
     sorted_by_inst = sorted(pkgs_inst.items(), key=itemgetter(1))
     for pkg, inst in sorted_by_inst:
         print pkg, inst
@@ -40,7 +40,7 @@ class Config(Singleton):
             ## general options
             self.debug = 0
             self.verbose = 1
-            self.output = "log"
+            self.output = "apprec.log"
             ## data_source options
             self.base_dir = os.path.expanduser("/home/tiago/.app-recommender/")
@@ -103,13 +103,14 @@ class Config(Singleton):
         print "  -f, --filtersdir=PATH      Path to filters directory"
         print "  -b, --pkgsfilter=FILTER    File containing packages to be considered for recommendations"
         print "  -a, --axi=PATH             Path to apt-xapian-index"
-        print "  -e, --dde=URL              DDE url"
         print "  -p, --popconindex=PATH     Path to popcon index"
-        print "  -m, --popcondir=PATH       Path to popcon submissions dir"
-        print "  -u, --indexmode=MODE       'old'|'reindex'|'cluster'|'recluster'"
-        print "  -l, --clustersdir=PATH     Path to popcon clusters dir"
-        print "  -c, --medoids=k            Number of medoids for clustering"
-        print "  -x, --maxpopcon=k          Number of submissions to be considered"
+        print "  -e, --dde=URL              DDE url"
+        # deprecated options
+        #print "  -m, --popcondir=PATH       Path to popcon submissions dir"
+        #print "  -u, --indexmode=MODE       'old'|'reindex'|'cluster'|'recluster'"
+        #print "  -l, --clustersdir=PATH     Path to popcon clusters dir"
+        #print "  -c, --medoids=k            Number of medoids for clustering"
+        #print "  -x, --maxpopcon=k          Number of submissions to be considered"
         print ""
         print " [ recommender ]"
         print "  -w, --weight=OPTION        Search weighting scheme"
@@ -123,11 +124,19 @@ class Config(Singleton):
         print "  bm25 = bm25 weighting scheme"
         print ""
         print " [ strategy options ] "
-        print "  cb = content-based "
-        print "  cbt = content-based using only tags as content "
-        print "  cbd = content-based using only package descriptions as content "
-        print "  col = collaborative "
-        print "  colct = collaborative through tags content "
+        print "  cb = content-based, mixed profile"
+        print "  cbt = content-based, tags only profile"
+        print "  cbd = content-based, description terms only profile"
+        print "  cbh = content-based, half-half profile"
+        print "  cb_eset = cb with eset profiling"
+        print "  cbt_eset = cbt with eset profiling"
+        print "  cbd_eset = cbd_eset with eset profiling"
+        print "  cbh_eset = cbh with eset profiling"
+        print "  knn = collaborative, tf-idf knn"
+        print "  knn_plus = collaborative, tf-idf weighted knn"
+        print "  knn_eset = collaborative, eset knn"
+        print "  knnco = collaborative through content"
+        print "  knnco_eset = collaborative through content, eset recommendation"
     def read_option(self, section, option):
         """
@@ -140,6 +140,29 @@ class FPR(Metric):
         return (float(len(evaluation.false_positive))/
                 evaluation.real_negative_len)
+class MCC(Metric):
+    """
+    Matthews correlation coefficient metric.
+    """
+    def __init__(self):
+        """
+        Define the label used to describe this metric.
+        """
+        self.desc = "    MCC    "
+
+    def run(self,evaluation):
+        """
+        Compute the coefficient from the confusion-matrix counts held by
+        the given evaluation. Return 0 when any of the four marginal sums
+        is zero, since the denominator would be zero.
+        """
+        tp = len(evaluation.true_positive)
+        fp = len(evaluation.false_positive)
+        fn = len(evaluation.false_negative)
+        tn = evaluation.true_negative_len
+        marginals = [tp+fp, tp+fn, tn+fp, tn+fn]
+        if 0 in marginals:
+            return 0
+        numerator = (tp*tn)-(fp*fn)
+        denominator = math.sqrt(marginals[0]*marginals[1]*marginals[2]*marginals[3])
+        return numerator/denominator
+
 class F_score(Metric):
     """
     Classification accuracy metric which correlates precision and recall into an
-Experiments handled by expsuite:
-https://github.com/rueckstiess/expsuite
+AppRecommender experiments and tests
+---------------------------------------
+
+Install dependencies:
+
+# apt-get install \
+python-unittest2 python-gnuplot gnuplot
+
+# cd ./src; git clone https://github.com/rueckstiess/expsuite (deprecated tests)
@@ -0,0 +1,186 @@
+#!/usr/bin/env python
+"""
+    k-suite - experiment different neighborhood sizes
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import sys
+sys.path.insert(0,'../')
+from config import Config
+from data import PopconXapianIndex, PopconSubmission
+from recommender import Recommender
+from user import LocalSystem, User
+from evaluation import *
+import logging
+import random
+import Gnuplot
+import numpy
+
+def plot_roc(k,roc_points,log_file):
+    """
+    Plot the given ROC points for neighborhood size k, together with the
+    diagonal from (0,0) to (1,1), and save the chart next to log_file as
+    "<log_file>-kNNN.png" and "<log_file>-kNNN.ps".
+    """
+    setup_name = log_file.split("/")[-1]
+    output_base = log_file+("-k%.3d"%k)
+    chart = Gnuplot.Gnuplot()
+    chart('set style data points')
+    chart.xlabel('False Positive Rate')
+    chart.ylabel('True Positive Rate')
+    # both rates are proportions, so fix the axes to the unit square
+    chart('set xrange [0:1.0]')
+    chart('set yrange [0:1.0]')
+    chart.title("Setup: %s-k%d" % (setup_name,k))
+    diagonal = Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7")
+    chart.plot(diagonal,Gnuplot.Data(roc_points))
+    chart.hardcopy(output_base+".png",terminal="png")
+    chart.hardcopy(output_base+".ps",terminal="postscript",enhanced=1,color=1)
+
+def plot_summary(precision,f05,mcc,log_file):
+    """
+    Plot the mean precision, F0.5 and MCC per neighborhood size and save
+    the chart as "<log_file>.png" and "<log_file>.ps".
+
+    Each argument maps a neighborhood size k to a non-empty list of
+    values measured for that k.
+    """
+    def mean_points(metric):
+        # Points are emitted in increasing k: the chart uses the "lines"
+        # style, so unordered dictionary keys would draw a zigzag curve.
+        # float() keeps the mean exact even if all values are integers.
+        return [[k,float(sum(metric[k]))/len(metric[k])]
+                for k in sorted(metric.keys())]
+
+    g = Gnuplot.Gnuplot()
+    g('set style data lines')
+    g.xlabel('Neighborhood (k)')
+    g.title("Setup: %s-size20" % (log_file.split("/")[-1]))
+    g.plot(Gnuplot.Data(mean_points(precision),title="P"),
+           Gnuplot.Data(mean_points(f05),title="F05"),
+           Gnuplot.Data(mean_points(mcc),title="MCC"))
+    g.hardcopy(log_file+(".png"),terminal="png")
+    g.hardcopy(log_file+(".ps"),terminal="postscript",enhanced=1,color=1)
+
+class ExperimentResults:
+    """
+    Accumulate the metrics of successive recommendation rounds and
+    summarize each of them as an arithmetic mean.
+    """
+    def __init__(self,repo_size):
+        self.repository_size = repo_size
+        self.precision = []
+        self.recall = []
+        self.fpr = []
+        self.f05 = []
+        self.mcc = []
+
+    def _mean(self,values):
+        # Mean of the collected values; 0 when nothing was collected.
+        if not values:
+            return 0
+        return sum(values)/len(values)
+
+    def add_result(self,ranking,sample):
+        """
+        Evaluate a ranking against the sample items and store one value
+        per metric.
+        """
+        predicted = RecommendationResult(dict.fromkeys(ranking,1))
+        real = RecommendationResult(sample)
+        evaluation = Evaluation(predicted,real,self.repository_size)
+        collectors = ((self.precision,Precision,()),
+                      (self.recall,Recall,()),
+                      (self.fpr,FPR,()),
+                      (self.f05,F_score,(0.5,)),
+                      (self.mcc,MCC,()))
+        for values,metric_class,args in collectors:
+            values.append(evaluation.run(metric_class(*args)))
+
+    def get_roc_point(self):
+        """
+        Return [mean false positive rate, mean recall], or [0,0] when
+        either list is empty.
+        """
+        if not (self.recall and self.fpr):
+            return [0,0]
+        return [self._mean(self.fpr),self._mean(self.recall)]
+
+    def get_precision_summary(self):
+        return self._mean(self.precision)
+
+    def get_f05_summary(self):
+        return self._mean(self.f05)
+
+    def get_mcc_summary(self):
+        return self._mean(self.mcc)
+
+if __name__ == '__main__':
+    # Evaluate one recommendation strategy for several neighborhood sizes
+    # (k) over every user listed in sample_file, writing per-k logs, a
+    # summary log and the corresponding plots under results/k-suite/.
+    #
+    # NOTE(review): os and PopconSystem are used below but neither is
+    # imported by name at the top of this script; presumably they arrive
+    # through "from evaluation import *" -- confirm. os is imported here
+    # explicitly so that the script does not depend on that.
+    import os
+    if len(sys.argv)<3:
+        print "Usage: k-suite strategy_str sample_file"
+        exit(1)
+    threshold = 20
+    iterations = 30
+    neighbors = [3,5,10,50,100,150,200,300,400,500]
+    cfg = Config()
+    cfg.strategy = sys.argv[1]
+    sample_file = sys.argv[2]
+    sample_name = sample_file.split('/')[-1]
+    # Each line of sample_file is a user id; the submission is stored
+    # under a directory named after the first two characters of the id.
+    population_sample = []
+    with open(sample_file,'r') as f:
+        for line in f:
+            user_id = line.strip('\n')
+            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
+    if not population_sample:
+        # Without users there is no repo_size and every mean below would
+        # divide by zero.
+        print "Empty sample file: %s" % sample_file
+        exit(1)
+    # setup dictionaries and files
+    roc_summary = {}
+    recommended = {}
+    precision_summary = {}
+    f05_summary = {}
+    mcc_summary = {}
+    sample_dir = ("results/k-suite/%s" % sample_name)
+    if not os.path.exists(sample_dir):
+        os.makedirs(sample_dir)
+    log_file = os.path.join(sample_dir,cfg.strategy)
+    with open(log_file,'w') as f:
+        f.write("# %s\n\n" % sample_name)
+        f.write("# strategy %s recommendation_size %d iterations %d\n\n" %
+                (cfg.strategy,threshold,iterations))
+        f.write("# k coverage \tprecision \tf05 \tmcc\n\n")
+
+    for k in neighbors:
+        roc_summary[k] = []
+        recommended[k] = set()
+        precision_summary[k] = []
+        f05_summary[k] = []
+        mcc_summary[k] = []
+        with open(log_file+"-k%.3d"%k,'w') as f:
+            f.write("# %s\n\n" % sample_name)
+            f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))
+            f.write("# roc_point \tprecision \tf05 \tmcc\n\n")
+
+    # main loop per user
+    for submission_file in population_sample:
+        user = PopconSystem(submission_file)
+        user.filter_pkg_profile(cfg.pkgs_filter)
+        user.maximal_pkg_profile()
+        for k in neighbors:
+            cfg.k_neighbors = k
+            rec = Recommender(cfg)
+            repo_size = rec.items_repository.get_doccount()
+            results = ExperimentResults(repo_size)
+            # n iterations for same recommender and user
+            for n in range(iterations):
+                # Randomly move 90% of the user profile into "sample";
+                # the recommendation is computed from the remaining items
+                # and evaluated against the removed ones.
+                profile_len = len(user.pkg_profile)
+                item_score = {}
+                for pkg in user.pkg_profile:
+                    item_score[pkg] = user.item_score[pkg]
+                sample = {}
+                sample_size = int(profile_len*0.9)
+                for i in range(sample_size):
+                    key = random.choice(item_score.keys())
+                    sample[key] = item_score.pop(key)
+                iteration_user = User(item_score)
+                recommendation = rec.get_recommendation(iteration_user,threshold)
+                if hasattr(recommendation,"ranking"):
+                    results.add_result(recommendation.ranking,sample)
+                    recommended[k] = recommended[k].union(recommendation.ranking)
+            # save summary
+            roc_point = results.get_roc_point()
+            roc_summary[k].append(roc_point)
+            precision = results.get_precision_summary()
+            precision_summary[k].append(precision)
+            f05 = results.get_f05_summary()
+            f05_summary[k].append(f05)
+            mcc = results.get_mcc_summary()
+            mcc_summary[k].append(mcc)
+            with open(log_file+"-k%.3d"%k,'a') as f:
+                f.write("[%.2f,%.2f] \t%.4f \t%.4f \t%.4f\n" %
+                        (roc_point[0],roc_point[1],precision,f05,mcc))
+    # back to main flow
+    with open(log_file,'a') as f:
+        plot_summary(precision_summary,f05_summary,mcc_summary,log_file)
+        for k in neighbors:
+            # Coverage: share of the repository recommended at least once
+            # for this k (the set is indexed by k; the former index
+            # "size" was an undefined name).
+            coverage = len(recommended[k])/float(repo_size)
+            f.write("%3d \t%.2f \t%.4f \t%.4f \t%.4f\n" %
+                    (k,coverage,float(sum(precision_summary[k]))/len(precision_summary[k]),
+                     float(sum(f05_summary[k]))/len(f05_summary[k]),
+                     float(sum(mcc_summary[k]))/len(mcc_summary[k])))
+            plot_roc(k,roc_summary[k],log_file)
@@ -0,0 +1,274 @@
+#!/usr/bin/env python
+"""
+    recommender suite - recommender experiments suite 
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import sys
+sys.path.insert(0,'../')
+from config import Config
+from data import PopconXapianIndex, PopconSubmission, AppAptXapianIndex
+from recommender import Recommender
+from user import LocalSystem, User
+from evaluation import *
+import logging
+import random
+import Gnuplot
+
+# Reduced parameter set, kept commented out as an alternative to the full
+# grid defined below.
+#iterations = 3
+#sample_proportions = [0.9]
+#weighting = [('bm25',1.2)]
+#collaborative = ['knn']
+#content_based = []
+#hybrid = ['knnco']
+#profile_size = [50,100]
+#popcon_size = ["1000"]
+#neighbors = [50]
+
+# Number of random profile splits evaluated per setup (see run_strategy).
+iterations = 10
+# Fractions of the user profile removed and used as the evaluation sample.
+sample_proportions = [0.5, 0.6, 0.7, 0.8, 0.9]
+# (weight, bm25_k1) pairs assigned to cfg.weight and cfg.bm25_k1.
+weighting = [('bm25',1.2), ('bm25',1.6), ('bm25',2.0), ('trad',0)]
+# Strategy names grouped by family; get_label() and the run_* functions
+# pick the parameters to vary according to these lists.
+content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
+collaborative = ['knn_eset','knn','knn_plus']
+hybrid = ['knnco','knnco_eset']
+
+# Values assigned to cfg.profile_size (content-based and hybrid runs).
+profile_size = range(20,100,20)
+#popcon_size = [1000,10000,50000,'full']
+# Values assigned to cfg.k_neighbors (collaborative and hybrid runs).
+neighbors = range(10,510,50)
+
+def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
+    """
+    Write the recall log of iteration n to the file "<log_file>-<n>".
+
+    The log holds a header built from label["description"] and
+    label["values"], a line with repo_size, profile_size and the sample
+    length and, when the recommendation has a ranking, the sorted
+    positions of the sample packages found in it followed by the list of
+    sample packages that were not recommended.
+    """
+    # The with statement closes the file even if a write fails.
+    with open(("%s-%d" % (log_file,n)),'w') as output:
+        output.write("# %s-n\n" % label["description"])
+        output.write("# %s-%d\n" % (label["values"],n))
+        output.write("\n%d %d %d\n" % \
+                     (repo_size,profile_size,len(sample)))
+        if hasattr(recommendation,"ranking"):
+            # Map each package to its first position once, instead of
+            # scanning the whole ranking twice for every sample package.
+            position = {}
+            for rank,pkg in enumerate(recommendation.ranking):
+                position.setdefault(pkg,rank)
+            notfound = []
+            ranks = []
+            for pkg in sample.keys():
+                if pkg in position:
+                    ranks.append(position[pkg])
+                else:
+                    notfound.append(pkg)
+            for r in sorted(ranks):
+                output.write(str(r)+"\n")
+            if notfound:
+                output.write("Out of recommendation:\n")
+                for pkg in notfound:
+                    output.write(pkg+"\n")
+
+def plot_summary(precision,recall,f1,f05,accuracy,log_file):
+    """
+    Plot the five metric series against the recommendation size and save
+    the chart as png and postscript, first with a linear x axis
+    ("<log_file>") and then with a logarithmic one ("<log_file>-logscale").
+    """
+    def export(chart,basename):
+        # save the current plot in both output formats
+        chart.hardcopy(basename+".png",terminal="png")
+        chart.hardcopy(basename+".ps",terminal="postscript",enhanced=1,color=1)
+
+    setup_name = log_file.split("/")[-1]
+    chart = Gnuplot.Gnuplot()
+    chart('set style data lines')
+    chart.xlabel('Recommendation size')
+    chart.title("Setup: %s" % setup_name)
+    chart.plot(Gnuplot.Data(accuracy,title="Accuracy"),
+               Gnuplot.Data(precision,title="Precision"),
+               Gnuplot.Data(recall,title="Recall"),
+               Gnuplot.Data(f1,title="F_1"),
+               Gnuplot.Data(f05,title="F_0.5"))
+    export(chart,log_file)
+    chart('set logscale x')
+    chart('replot')
+    export(chart,log_file+"-logscale")
+
+def get_label(cfg,sample_proportion):
+    """
+    Build the label of an experiment setup: a dictionary with a
+    "description" of the fields and the formatted "values", chosen by the
+    family of cfg.strategy. For an unknown strategy a message is printed
+    and an empty dictionary is returned.
+
+    NOTE(review): for content-based and hybrid strategies the description
+    lists "filter" before "profile", while the values string places the
+    profile size before the filter name -- confirm which order is meant.
+    """
+    if cfg.strategy in content_based:
+        description = "strategy-filter-profile-k1_bm25-sample"
+        values = ("%s-profile%d-%s-kbm%.1f-sample%.1f" %
+                  (cfg.strategy,cfg.profile_size,
+                   cfg.pkgs_filter.split("/")[-1],
+                   cfg.bm25_k1,sample_proportion))
+        return {"description": description, "values": values}
+    if cfg.strategy in collaborative:
+        description = "strategy-knn-filter-k1_bm25-sample"
+        values = ("%s-k%d-%s-kbm%.1f-sample%.1f" %
+                  (cfg.strategy,cfg.k_neighbors,
+                   cfg.pkgs_filter.split("/")[-1],
+                   cfg.bm25_k1,sample_proportion))
+        return {"description": description, "values": values}
+    if cfg.strategy in hybrid:
+        description = "strategy-knn-filter-profile-k1_bm25-sample"
+        values = ("%s-k%d-profile%d-%s-kbm%.1f-sample%.1f" %
+                  (cfg.strategy,cfg.k_neighbors,cfg.profile_size,
+                   cfg.pkgs_filter.split("/")[-1],
+                   cfg.bm25_k1,sample_proportion))
+        return {"description": description, "values": values}
+    print "Unknown strategy"
+    return {}
+
+class ExperimentResults:
+    """
+    Collect accuracy, precision, recall, F1 and F0.5 for a fixed set of
+    recommendation sizes: 1, every 10 up to 190 and every 100 from 200 up
+    to the repository size.
+    """
+    def __init__(self,repo_size):
+        self.repository_size = repo_size
+        self.accuracy = {}
+        self.precision = {}
+        self.recall = {}
+        self.f1 = {}
+        self.f05 = {}
+        metrics = (self.accuracy,self.precision,self.recall,self.f1,self.f05)
+        sizes = [1]+range(10,200,10)+range(200,self.repository_size,100)
+        for size in sizes:
+            for metric in metrics:
+                metric[size] = []
+
+    def _mean_by_size(self,metric):
+        # [size, mean] pairs in increasing order of size
+        return sorted([size,sum(values)/len(values)]
+                      for size,values in metric.items())
+
+    def _best(self,metric):
+        # (size, value) of the highest single value stored in the metric
+        size = max(metric, key = lambda x: max(metric[x]))
+        return (size,max(metric[size]))
+
+    def add_result(self,ranking,sample):
+        """
+        Evaluate the first "size" items of the ranking against the sample,
+        for every tracked size, and store one value per metric.
+        """
+        for size in self.accuracy.keys():
+            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
+            real = RecommendationResult(sample)
+            evaluation = Evaluation(predicted,real,self.repository_size)
+            collectors = ((self.accuracy,Accuracy,()),
+                          (self.precision,Precision,()),
+                          (self.recall,Recall,()),
+                          (self.f1,F_score,(1,)),
+                          (self.f05,F_score,(0.5,)))
+            for metric,metric_class,args in collectors:
+                metric[size].append(evaluation.run(metric_class(*args)))
+
+    def get_precision_summary(self):
+        return self._mean_by_size(self.precision)
+
+    def get_recall_summary(self):
+        return self._mean_by_size(self.recall)
+
+    def get_f1_summary(self):
+        return self._mean_by_size(self.f1)
+
+    def get_f05_summary(self):
+        return self._mean_by_size(self.f05)
+
+    def get_accuracy_summary(self):
+        return self._mean_by_size(self.accuracy)
+
+    def best_precision(self):
+        return self._best(self.precision)
+
+    def best_f1(self):
+        return self._best(self.f1)
+
+    def best_f05(self):
+        return self._best(self.f05)
+
+def run_strategy(cfg,user):
+    """
+    Run the strategy currently configured in cfg for the given user, once
+    for every weighting scheme and sample proportion, writing the logs
+    and plots of each setup under "results/strategies/".
+    """
+    for weight in weighting:
+        cfg.weight = weight[0]
+        cfg.bm25_k1 = weight[1]
+        rec = Recommender(cfg)
+        repo_size = rec.items_repository.get_doccount()
+        for proportion in sample_proportions:
+            results = ExperimentResults(repo_size)
+            label = get_label(cfg,proportion)
+            log_file = "results/strategies/"+label["values"]
+            for n in range(iterations):
+                # Fill sample profile
+                # "sample" receives a random share of the profile, which is
+                # removed from item_score; the recommendation is computed
+                # from the remaining items and checked against the sample.
+                profile_size = len(user.pkg_profile)
+                item_score = {}
+                for pkg in user.pkg_profile:
+                    item_score[pkg] = user.item_score[pkg]
+                sample = {}
+                sample_size = int(profile_size*proportion)
+                for i in range(sample_size):
+                     key = random.choice(item_score.keys())
+                     sample[key] = item_score.pop(key)
+                iteration_user = User(item_score)
+                # repo_size is passed as the recommendation size limit
+                recommendation = rec.get_recommendation(iteration_user,repo_size)
+                write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file)
+                if hasattr(recommendation,"ranking"):
+                    results.add_result(recommendation.ranking,sample)
+            # NOTE(review): if no iteration produced a ranking, the lists
+            # below are empty and the means raise ZeroDivisionError; the
+            # coverage line also reads only the last recommendation.
+            with open(log_file,'w') as f:
+                precision_10 = sum(results.precision[10])/len(results.precision[10])
+                f1_10 = sum(results.f1[10])/len(results.f1[10])
+                f05_10 = sum(results.f05[10])/len(results.f05[10])
+                f.write("# %s\n# %s\n\ncoverage %d\n\n" %
+                        (label["description"],label["values"],recommendation.size))
+                f.write("# best results (recommendation size; metric)\n")
+                f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\nf05 (%d; %.2f)\n\n" %
+                        (results.best_precision()[0],results.best_precision()[1],
+                         results.best_f1()[0],results.best_f1()[1],
+                         results.best_f05()[0],results.best_f05()[1]))
+                f.write("# recommendation size 10\nprecision (10; %.2f)\nf1 (10; %.2f)\nf05 (10; %.2f)" %
+                        (precision_10,f1_10,f05_10))
+            # per-size means of each metric, plotted next to the log file
+            precision = results.get_precision_summary()
+            recall = results.get_recall_summary()
+            f1 = results.get_f1_summary()
+            f05 = results.get_f05_summary()
+            accuracy = results.get_accuracy_summary()
+            plot_summary(precision,recall,f1,f05,accuracy,log_file)
+
+def run_content(user,cfg):
+    """
+    Run every content-based strategy for every profile size.
+    """
+    for name in content_based:
+        cfg.strategy = name
+        for length in profile_size:
+            cfg.profile_size = length
+            run_strategy(cfg,user)
+
+def run_collaborative(user,cfg):
+    """
+    Run every collaborative strategy for every neighborhood size.
+    """
+    # Base index paths, read for the popcon-size variation that is
+    # currently disabled (kept commented out below).
+    popcon_programs_base = cfg.popcon_desktopapps
+    popcon_programs_base = cfg.popcon_programs
+    for name in collaborative:
+        cfg.strategy = name
+        for n_neighbors in neighbors:
+            cfg.k_neighbors = n_neighbors
+            #for size in popcon_size:
+            #    if size:
+            #        cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
+            #        cfg.popcon_programs = popcon_programs+"_"+size
+            run_strategy(cfg,user)
+
+def run_hybrid(user,cfg):
+    """
+    Run every hybrid strategy for every combination of neighborhood size
+    and profile size.
+    """
+    # Base index paths, read for the popcon-size variation that is
+    # currently disabled (kept commented out below).
+    popcon_programs_base = cfg.popcon_desktopapps
+    popcon_programs_base = cfg.popcon_programs
+    for name in hybrid:
+        cfg.strategy = name
+        for n_neighbors in neighbors:
+            cfg.k_neighbors = n_neighbors
+            #for size in popcon_size:
+            #    if size:
+            #        cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
+            #        cfg.popcon_programs = popcon_programs+"_"+size
+            for length in profile_size:
+                cfg.profile_size = length
+                run_strategy(cfg,user)
+
+if __name__ == '__main__':
+    # Run the experiment families named on the command line ("content",
+    # "collaborative", "hybrid"), or all of them when no argument is given,
+    # for a single fixed popcon submission.
+    #user = LocalSystem()
+    #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
+
+    cfg = Config()
+    user = PopconSystem("/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7")
+    #user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
+    user.filter_pkg_profile(cfg.pkgs_filter)
+    user.maximal_pkg_profile()
+
+    run_all = len(sys.argv)<2
+    families = (("content",run_content),
+                ("collaborative",run_collaborative),
+                ("hybrid",run_hybrid))
+    for keyword,runner in families:
+        if keyword in sys.argv or run_all:
+            runner(user,cfg)
@@ -1,27 +0,0 @@
-[DEFAULT]
-repetitions = 1
-iterations = 10
-path = 'results'
-experiment = 'grid'
-weight = ['bm25', 'trad']
-;profile_size = range(10,100,10)
-;sample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
-sample = [0.6, 0.7, 0.8, 0.9]
-
-[content]
-strategy = ['cb','cbt','cbd']
-
-[clustering]
-experiment = 'single'
-;iterations = 4
-;medoids = range(2,6)
-iterations = 6
-medoids = [100,500,1000,5000,10000,50000]
-;disabled for this experiment
-weight = 0
-profile_size = 0
-sample = 0
-
-[colaborative]
-users_repository=["data/popcon","data/popcon-100","data/popcon-500","data/popcon-1000","data/popcon-5000","data/popcon-10000","data/popcon-50000"]
-neighbors = range(10,1010,50)
@@ -0,0 +1,49 @@
+#! /usr/bin/env python
+"""
+    sample-popcon - extract a sample from popcon population
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import xapian
+import os
+import random
+import sys
+
+if __name__ == '__main__':
+    # Move the submissions listed in sample_file (one id per line) from
+    # the popcon index into a new index named
+    # "<popcon_index>-<sample file name>".
+    #
+    # Usage: extract-sample-db sample_file popcon_index
+    try:
+        sample_file = sys.argv[1]
+        popcon = xapian.WritableDatabase(sys.argv[2],xapian.DB_OPEN)
+    except (IndexError, xapian.Error):
+        # Missing argument or index that cannot be opened. Other
+        # exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
+        print "Usage: extract-sample-db sample_file popcon_index"
+        exit(1)
+    enquire = xapian.Enquire(popcon)
+    print sample_file.split("/")
+    new_popcon = xapian.WritableDatabase(sys.argv[2]+"-"+sample_file.split("/")[-1],xapian.DB_CREATE_OR_OVERWRITE)
+    print ("Popcon repository size: %d" % popcon.get_doccount())
+    # The with statement closes the sample file once it has been read.
+    with open(sample_file) as submissions:
+        for submission in submissions:
+            submission_id = submission.strip()
+            if not submission_id:
+                # a blank line would otherwise query the bare "ID" term
+                continue
+            print "ID"+submission_id
+            query = xapian.Query("ID"+submission_id)
+            enquire.set_query(query)
+            mset = enquire.get_mset(0,20)
+            for m in mset:
+                # copy the document to the new index before deleting it
+                print "Adding doc %s"%m.docid
+                new_popcon.add_document(popcon.get_document(m.docid))
+                print "Removing doc %s"%m.docid
+                popcon.delete_document(m.docid)
+    print ("Popcon repository size: %d" % popcon.get_doccount())
+    print ("Popcon repository size: %d" % new_popcon.get_doccount())
@@ -0,0 +1,202 @@
+#!/usr/bin/env python
+"""
+    hybrid-suite
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import sys
+sys.path.insert(0,'../')
+from config import Config
+from data import PopconXapianIndex, PopconSubmission
+from recommender import Recommender
+from user import LocalSystem, User
+from evaluation import *
+import logging
+import random
+import Gnuplot
+import numpy
+
+#hybrid_strategies = ['knnco','knnco_eset']
+
+if __name__ == '__main__':
+    # Both the strategy (argv[1]) and the sample file (argv[2]) are read
+    # below, so fail with the usage message unless both were given.
+    if len(sys.argv)<3:
+        print "Usage: hybrid strategy sample_file"
+        sys.exit(1)
+
+    iterations = 20
+    profile_size = [10,40,70,100,170,240]
+    neighbor_size = [3,10,50,70,100,150,200]
+
+    #iterations = 1
+    #profile_size = [10,20,30]
+    #neighbor_size = [10,20,30]
+
+    cfg = Config()
+    population_sample = []
+    strategy = sys.argv[1]
+    sample_file = sys.argv[2]
+    sample_str = sample_file.split('/')[-1]
+    with open(sample_file,'r') as f:
+        for line in f.readlines():
+            user_id = line.strip('\n')
+            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
+    sample_dir = ("results/hybrid/%s/%s" % (sample_str,strategy))
+    if not os.path.exists(sample_dir):
+        os.makedirs(sample_dir)
+
+    cfg.strategy = strategy
+    p_10_summary = {}
+    f05_100_summary = {}
+    c_10 = {}
+    c_100 = {}
+
+    log_file = os.path.join(sample_dir,sample_str+"-"+cfg.strategy)
+    graph_10 = {}
+    graph_100 = {}
+    graph_10_jpg = {}
+    graph_100_jpg = {}
+    comment_10 = {}
+    comment_100 = {}
+    for k in neighbor_size:
+        graph_10[k] = log_file+("-neighborhood%.3d-010.png"%k)
+        graph_100[k] = log_file+("-neighborhood%.3d-100.png"%k)
+        # str.strip(".png") removes any of the characters '.', 'p', 'n', 'g'
+        # from BOTH ends of the string rather than the ".png" suffix, so the
+        # extension is sliced off instead (both names always end in ".png").
+        graph_10_jpg[k] = graph_10[k][:-len(".png")]+".jpg"
+        graph_100_jpg[k] = graph_100[k][:-len(".png")]+".jpg"
+        comment_10[k] = graph_10_jpg[k]+".comment"
+        comment_100[k] = graph_100_jpg[k]+".comment"
+
+        with open(comment_10[k],'w') as f:
+            f.write("# %s\n" % sample_str)
+            f.write("# strategy %s\n# threshold 10\n# iterations %d\n\n" %
+                    (cfg.strategy,iterations))
+            f.write("# neighborhood\tprofile\tmean_p_10\tdev_p_10\tc_10\n\n")
+        with open(comment_100[k],'w') as f:
+            f.write("# %s\n" % sample_str)
+            f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
+                    (cfg.strategy,iterations))
+            f.write("# neighborhood\tprofile\tmean_f05_100\tdev_f05_100\tc_100\n\n")
+
+        c_10[k] = {}
+        c_100[k] = {}
+        p_10_summary[k] = {}
+        f05_100_summary[k] = {}
+        for size in profile_size:
+            c_10[k][size] = set()
+            c_100[k][size] = set()
+            p_10_summary[k][size] = []
+            f05_100_summary[k][size] = []
+            with open(log_file+"-neighborhood%.3d-profile%.3d"%(k,size),'w') as f:
+                f.write("# %s\n" % sample_str)
+                f.write("# strategy %s-neighborhood%.3d-profile%.3d\n\n" % (cfg.strategy,k,size))
+                f.write("# p_10\t\tf05_100\n\n")
+
+    # main loop per user: every submission in the sample is evaluated for
+    # each (neighborhood size, profile size) combination
+    for submission_file in population_sample:
+        user = PopconSystem(submission_file)
+        user.filter_pkg_profile(cfg.pkgs_filter)
+        user.maximal_pkg_profile()
+        for k in neighbor_size:
+            cfg.k_neighbors = k
+            for size in profile_size:
+                cfg.profile_size = size
+                # A new recommender is built for each configuration, after
+                # cfg has been updated with the current k and profile size.
+                rec = Recommender(cfg)
+                repo_size = rec.items_repository.get_doccount()
+                # Per-iteration metric values for this user and configuration
+                p_10 = []
+                f05_100 = []
+                for n in range(iterations):
+                    # Split the user profile at random: 90% of the packages
+                    # are moved into `sample` (used below as the expected
+                    # result) and the remaining 10% stay in `item_score`,
+                    # which becomes the profile of the iteration user.
+                    profile_len = len(user.pkg_profile)
+                    item_score = {}
+                    for pkg in user.pkg_profile:
+                        item_score[pkg] = user.item_score[pkg]
+                    sample = {}
+                    sample_size = int(profile_len*0.9)
+                    for i in range(sample_size):
+                         key = random.choice(item_score.keys())
+                         sample[key] = item_score.pop(key)
+                    iteration_user = User(item_score)
+                    # Ask for a recommendation as large as the repository
+                    recommendation = rec.get_recommendation(iteration_user,repo_size)
+                    # Results without a "ranking" attribute are skipped
+                    if hasattr(recommendation,"ranking"):
+                        ranking = recommendation.ranking
+                        real = RecommendationResult(sample)
+                        # Precision over the top 10 ranked items
+                        predicted_10 = RecommendationResult(dict.fromkeys(ranking[:10],1))
+                        evaluation = Evaluation(predicted_10,real,repo_size)
+                        p_10.append(evaluation.run(Precision()))
+                        # F_score(0.5) over the top 100 ranked items
+                        predicted_100 = RecommendationResult(dict.fromkeys(ranking[:100],1))
+                        evaluation = Evaluation(predicted_100,real,repo_size)
+                        f05_100.append(evaluation.run(F_score(0.5)))
+                        # Accumulate the distinct items ever recommended in
+                        # the top 10 / top 100; coverage is computed from
+                        # these sets after the loop.
+                        c_10[k][size] = c_10[k][size].union(recommendation.ranking[:10])
+                        c_100[k][size] = c_100[k][size].union(recommendation.ranking[:100])
+                # save summary: one mean per user, only when at least one
+                # iteration produced a ranking
+                if p_10:
+                    p_10_summary[k][size].append(numpy.mean(p_10))
+                if f05_100:
+                    f05_100_summary[k][size].append(numpy.mean(f05_100))
+
+                # NOTE(review): unlike the summaries above, this line is
+                # written even when p_10/f05_100 are empty, in which case
+                # numpy.mean yields nan - confirm that is intended.
+                with open(log_file+"-neighborhood%.3d-profile%.3d"%(k,size),'a') as f:
+                    f.write("%.4f\t\t%.4f\n" %
+                            (numpy.mean(p_10),numpy.mean(f05_100)))
+
+    # back to main flow
+    coverage_10 = {}
+    coverage_100 = {}
+    for k in neighbor_size:
+        coverage_10[k] = {}
+        coverage_100[k] = {}
+        with open(comment_10[k],'a') as f:
+            for size in profile_size:
+                coverage_10[k][size] = len(c_10[k][size])/float(repo_size)
+                f.write("%3d\t\t%3d\t\t%.4f\t%.4f\t%.4f\n" %
+                        (k,size,numpy.mean(p_10_summary[k][size]),
+                         numpy.std(p_10_summary[k][size]),coverage_10[k][size]))
+        with open(comment_100[k],'a') as f:
+            for size in profile_size:
+                coverage_100[k][size] = len(c_100[k][size])/float(repo_size)
+                f.write("%3d\t\t%3d\t\t%.4f\t%.4f\t%.4f\n" %
+                        (k,size,numpy.mean(f05_100_summary[k][size]),
+                         numpy.std(f05_100_summary[k][size]),coverage_100[k][size]))
+
+    for k in neighbor_size:
+        # plot results summary
+        g = Gnuplot.Gnuplot()
+        g('set style data lines')
+        g('set yrange [0:1.0]')
+        g.xlabel('Profile size')
+        g.title("Setup: %s-neighborhood%3d (threshold 10)" % (cfg.strategy,k))
+        g.plot(Gnuplot.Data(sorted([[i,numpy.mean(p_10_summary[k][i]),numpy.std(p_10_summary[k][i])]
+                                    for i in p_10_summary[k].keys()]),title="Precision"),
+               Gnuplot.Data(sorted([[i,numpy.mean(p_10_summary[k][i]),numpy.std(p_10_summary[k][i])]
+                                    for i in p_10_summary[k].keys()]),title="Deviation",
+                                    with_="yerrorbar lt 2 pt 6"),
+               Gnuplot.Data(sorted([[i,coverage_10[k][i]]
+                                    for i in coverage_10[k].keys()]),title="Coverage"))
+        g.hardcopy(graph_10[k],terminal="png")
+
+        g = Gnuplot.Gnuplot()
+        g('set style data lines')
+        g('set yrange [0:1.0]')
+        g.xlabel('Profile size')
+        g.title("Setup: %s-neighborhood%3d (threshold 100)" % (cfg.strategy,k))
+        g.plot(Gnuplot.Data(sorted([[i,numpy.mean(f05_100_summary[k][i]),numpy.std(f05_100_summary[k][i])]
+                                    for i in f05_100_summary[k].keys()]),title="F05"),
+               Gnuplot.Data(sorted([[i,numpy.mean(f05_100_summary[k][i]),numpy.std(f05_100_summary[k][i])]
+                                    for i in f05_100_summary[k].keys()]),title="Deviation",
+                                    with_="yerrorbar lt 2 pt 6"),
+               Gnuplot.Data(sorted([[i,coverage_100[k][i]]
+                                    for i in coverage_100[k].keys()]),title="Coverage"))
+        g.hardcopy(graph_100[k],terminal="png")
@@ -1,152 +0,0 @@
-#!/usr/bin/env python
-"""
-    recommender suite - recommender experiments suite 
-"""
-__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
-__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
-__license__ = """
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-"""
-
-import sys
-sys.path.insert(0,'../')
-from config import Config
-from data import PopconXapianIndex, PopconSubmission
-from recommender import Recommender
-from user import LocalSystem, User
-from evaluation import *
-import logging
-import random
-import Gnuplot
-import numpy
-
-def plot_roc(p,roc_points,log_file):
-    g = Gnuplot.Gnuplot()
-    g('set style data points')
-    g.xlabel('False Positive Rate')
-    g.ylabel('True Positive Rate')
-    g('set xrange [0:1.0]')
-    g('set yrange [0:1.0]')
-    g.title("Setup: %s" % log_file.split("/")[-1])
-    g.plot(Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"),
-           Gnuplot.Data(roc_points,title="k %d"%k))
-    g.hardcopy(log_file+("-k%.3d.png"%k),terminal="png")
-    g.hardcopy(log_file+("-k%.3d.ps"%k),terminal="postscript",enhanced=1,color=1)
-
-class ExperimentResults:
-    def __init__(self,repo_size):
-        self.repository_size = repo_size
-        self.precision = []
-        self.recall = []
-        self.fpr = []
-
-    def add_result(self,ranking,sample):
-        predicted = RecommendationResult(dict.fromkeys(ranking,1))
-        real = RecommendationResult(sample)
-        evaluation = Evaluation(predicted,real,self.repository_size)
-        self.precision.append(evaluation.run(Precision()))
-        self.recall.append(evaluation.run(Recall()))
-        self.fpr.append(evaluation.run(FPR()))
-
-    # Average ROC by threshold (whici is the size)
-    def get_roc_point(self):
-        tpr = self.recall
-        fpr = self.fpr
-        return [sum(fpr)/len(fpr),sum(tpr)/len(tpr)]
-
-    def get_precision_summary(self):
-        return  sum(self.precision)/len(self.precision)
-
-    def get_recall_summary(self):
-        return  sum(self.recall)/len(self.recall)
-
-if __name__ == '__main__':
-    # experiment parameters
-    threshold = 20
-    iterations = 30
-    sample_file = "results/misc-popcon/sample-050-100"
-    neighbors = [3,5,10,50,100,150,200,300,400,500]
-    cfg = Config()
-    cfg.strategy = "knn"
-    print cfg.popcon_index
-    sample = []
-    with open(sample_file,'r') as f:
-        for line in f.readlines():
-            user_id = line.strip('\n')
-            sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
-    # setup dictionaries and files
-    roc_points = {}
-    recommended = {}
-    precisions = {}
-    aucs = {}
-    log_file = "results/k-suite/sample-050-100/%s" % (cfg.strategy)
-    for k in neighbors:
-        roc_points[k] = []
-        recommended[k] = set()
-        precisions[k] = []
-        aucs[k] = []
-        with open(log_file+"-k%.3d"%k,'w') as f:
-            f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))
-            f.write("# roc_point \tp(20) \tauc\n\n") 
-    # main loop per user
-    for submission_file in sample:
-        user = PopconSystem(submission_file)
-        user.filter_pkg_profile(cfg.pkgs_filter)
-        user.maximal_pkg_profile()
-        for k in neighbors:
-            cfg.k_neighbors = k
-            rec = Recommender(cfg)
-            repo_size = rec.items_repository.get_doccount()
-            results = ExperimentResults(repo_size)
-            # n iterations for same recommender and user
-            for n in range(iterations):
-                # Fill sample profile
-                profile_size = len(user.pkg_profile)
-                item_score = {}
-                for pkg in user.pkg_profile:
-                    item_score[pkg] = user.item_score[pkg]
-                sample = {}
-                sample_size = int(profile_size*0.9)
-                for i in range(sample_size):
-                     key = random.choice(item_score.keys())
-                     sample[key] = item_score.pop(key)
-                iteration_user = User(item_score)
-                recommendation = rec.get_recommendation(iteration_user,threshold)
-                if hasattr(recommendation,"ranking"):
-                    results.add_result(recommendation.ranking,sample)
-                    print "ranking",recommendation.ranking
-                    print "recommended_%d"%k,recommended[k]
-                    recommended[k] = recommended[k].union(recommendation.ranking)
-                    print recommended[k]
-            # save summary
-            roc_point = results.get_roc_point()
-            auc = numpy.trapz(y=[0,roc_point[1],1],x=[0,roc_point[0],1])
-            p_20 = results.get_precision_summary()
-            roc_points[k].append(roc_point)
-            aucs[k].append(auc)
-            precisions[k].append(p_20)
-            with open(log_file+"-k%.3d"%k,'a') as f:
-                f.write("%s \t%.2f \t%.4f\n" % (str(roc_point),p_20,auc))
-    # back to main flow
-    with open(log_file,'w') as f:
-        f.write("# k coverage \tp(20) \tauc\n\n")
-        for k in neighbors:
-            print "len_recommended_%d"%k,len(recommended[k])
-            print "repo_size",repo_size
-            coverage = len(recommended[k])/float(repo_size)
-            print coverage
-            f.write("%d \t%.2f \t%.2f \t%.2fi\n" %
-                    (k,coverage,float(sum(precisions[k]))/len(precisions[k]),
-                     float(sum(aucs[k]))/len(aucs[k])))
-            plot_roc(k,roc_points[k],log_file)
@@ -1,51 +0,0 @@
-#!/usr/bin/env python
-"""
-    recommender suite - recommender experiments suite 
-"""
-__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
-__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
-__license__ = """
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-"""
-
-import sys
-import os
-sys.path.insert(0,'../')
-from config import Config
-from data import PopconXapianIndex, PopconSubmission
-from recommender import Recommender
-from user import LocalSystem, User
-from evaluation import *
-import logging
-import random
-import Gnuplot
-
-if __name__ == '__main__':
-
-    cfg = Config()
-    cfg.index_mode = "recluster"
-    logging.info("Starting clustering experiments")
-    logging.info("Medoids: %d\t Max popcon:%d" % (cfg.k_medoids,cfg.max_popcon))
-    cfg.popcon_dir = os.path.expanduser("~/org/popcon.debian.org/popcon-mail/popcon-entries/")
-    cfg.popcon_index = cfg.popcon_index+("_%dmedoids%dmax" %
-                                         (cfg.k_medoids,cfg.max_popcon))
-    cfg.clusters_dir = cfg.clusters_dir+("_%dmedoids%dmax" %
-                                         (cfg.k_medoids,cfg.max_popcon))
-    pxi = PopconXapianIndex(cfg)
-    logging.info("Overall dispersion: %f\n" % pxi.cluster_dispersion)
-    # Write clustering log
-    output = open(("results/clustering/%dmedoids%dmax" % (cfg.k_medoids,cfg.max_popcon)),'w')
-    output.write("# k_medoids\tmax_popcon\tdispersion\n")
-    output.write("%d %f\n" % (cfg.k_medoids,cfg.max_popcon,pxi.cluster_dispersion))
-    output.close()
@@ -1,27 +0,0 @@
-[DEFAULT]
-repetitions = 1
-iterations = 10
-path = 'results'
-experiment = 'grid'
-weight = ['bm25', 'trad']
-;profile_size = range(10,100,10)
-;sample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
-sample = [0.6, 0.7, 0.8, 0.9]
-
-[content]
-strategy = ['cb','cbt','cbd']
-
-[clustering]
-experiment = 'single'
-;iterations = 4
-;medoids = range(2,6)
-iterations = 6
-medoids = [100,500,1000,5000,10000,50000]
-;disabled for this experiment
-weight = 0
-profile_size = 0
-sample = 0
-
-[colaborative]
-users_repository=["data/popcon","data/popcon-100","data/popcon-500","data/popcon-1000","data/popcon-5000","data/popcon-10000","data/popcon-50000"]
-neighbors = range(10,1010,50)
@@ -1,171 +0,0 @@
-#!/usr/bin/env python
-"""
-    recommender suite - recommender experiments suite 
-"""
-__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
-__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
-__license__ = """
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-"""
-
-import expsuite
-import sys
-sys.path.insert(0,'../')
-from config import Config
-from data import PopconXapianIndex, PopconSubmission
-from recommender import Recommender
-from user import LocalSystem, User
-from evaluation import *
-import logging
-import random
-import Gnuplot
-
-class ClusteringSuite(expsuite.PyExperimentSuite):
-    def reset(self, params, rep):
-        self.cfg = Config()
-        self.cfg.popcon_index = "../tests/test_data/.sample_pxi"
-        self.cfg.popcon_dir = "../tests/test_data/popcon_dir"
-        self.cfg.clusters_dir = "../tests/test_data/clusters_dir"
-
-        if params['name'] == "clustering":
-            logging.info("Starting 'clustering' experiments suite...")
-            self.cfg.index_mode = "recluster"
-
-    def iterate(self, params, rep, n):
-        if params['name'] == "clustering":
-            logging.info("Running iteration %d" % params['medoids'][n])
-            self.cfg.k_medoids = params['medoids'][n]
-            pxi = PopconXapianIndex(self.cfg)
-            result = {'k_medoids': params['medoids'][n],
-                   'dispersion': pxi.cluster_dispersion}
-        else:
-            result = {}
-        return result
-
-class ContentBasedSuite(expsuite.PyExperimentSuite):
-    def reset(self, params, rep):
-        if params['name'].startswith("content"):
-            cfg = Config()
-            #if the index was not built yet
-            #app_axi = AppAptXapianIndex(cfg.axi,"results/arnaldo/AppAxi")
-            cfg.axi = "data/AppAxi"
-            cfg.index_mode = "old"
-            cfg.weight = params['weight']
-            self.rec = Recommender(cfg)
-            self.rec.set_strategy(params['strategy'])
-            self.repo_size = self.rec.items_repository.get_doccount()
-            self.user = LocalSystem()
-            self.user.app_pkg_profile(self.rec.items_repository)
-            self.user.no_auto_pkg_profile()
-            self.sample_size = int(len(self.user.pkg_profile)*params['sample'])
-            # iteration should be set to 10 in config file
-            #self.profile_size = range(10,101,10)
-
-    def iterate(self, params, rep, n):
-        if params['name'].startswith("content"):
-            item_score = dict.fromkeys(self.user.pkg_profile,1)
-            # Prepare partition
-            sample = {}
-            for i in range(self.sample_size):
-                 key = random.choice(item_score.keys())
-                 sample[key] = item_score.pop(key)
-            # Get full recommendation
-            user = User(item_score)
-            recommendation = self.rec.get_recommendation(user,self.repo_size)
-            # Write recall log
-            recall_file = "results/content/recall/%s-%s-%.2f-%d" % \
-                          (params['strategy'],params['weight'],params['sample'],n)
-            output = open(recall_file,'w')
-            output.write("# weight=%s\n" % params['weight'])
-            output.write("# strategy=%s\n" % params['strategy'])
-            output.write("# sample=%f\n" % params['sample'])
-            output.write("\n%d %d %d\n" % \
-                         (self.repo_size,len(item_score),self.sample_size))
-            notfound = []
-            ranks = []
-            for pkg in sample.keys():
-                if pkg in recommendation.ranking:
-                    ranks.append(recommendation.ranking.index(pkg))
-                else:
-                    notfound.append(pkg)
-            for r in sorted(ranks):
-                output.write(str(r)+"\n")
-            if notfound:
-                output.write("Out of recommendation:\n")
-                for pkg in notfound:
-                    output.write(pkg+"\n")
-            output.close()
-            # Plot metrics summary
-            accuracy = []
-            precision = []
-            recall = []
-            f1 = []
-            g = Gnuplot.Gnuplot()
-            g('set style data lines')
-            g.xlabel('Recommendation size')
-            for size in range(1,len(recommendation.ranking)+1,100):
-                predicted = RecommendationResult(dict.fromkeys(recommendation.ranking[:size],1))
-                real = RecommendationResult(sample)
-                evaluation = Evaluation(predicted,real,self.repo_size)
-                accuracy.append([size,evaluation.run(Accuracy())])
-                precision.append([size,evaluation.run(Precision())])
-                recall.append([size,evaluation.run(Recall())])
-                f1.append([size,evaluation.run(F1())])
-            g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
-                   Gnuplot.Data(precision,title="Precision"),
-                   Gnuplot.Data(recall,title="Recall"),
-                   Gnuplot.Data(f1,title="F1"))
-            g.hardcopy(recall_file+"-plot.ps", enhanced=1, color=1)
-            # Iteration log
-            result = {'iteration': n,
-                      'weight': params['weight'],
-                      'strategy': params['strategy'],
-                      'accuracy': accuracy[20],
-                      'precision': precision[20],
-                      'recall:': recall[20],
-                      'f1': f1[20]}
-            return result
-
-#class CollaborativeSuite(expsuite.PyExperimentSuite):
-#    def reset(self, params, rep):
-#        if params['name'].startswith("collaborative"):
-#
-#    def iterate(self, params, rep, n):
-#        if params['name'].startswith("collaborative"):
-#            for root, dirs, files in os.walk(self.source_dir):
-#                for popcon_file in files:
-#                    submission = PopconSubmission(os.path.join(root,popcon_file))
-#                    user = User(submission.packages)
-#                    user.maximal_pkg_profile()
-#                    rec.get_recommendation(user)
-#                    precision = 0
-#                    result = {'weight': params['weight'],
-#                              'strategy': params['strategy'],
-#                              'profile_size': self.profile_size[n],
-#                              'accuracy': accuracy,
-#                              'precision': precision,
-#                              'recall:': recall,
-#                              'f1': }
-#        else:
-#            result = {}
-#        return result
-
-if __name__ == '__main__':
-
-    if "clustering" in sys.argv or len(sys.argv)<3:
-        ClusteringSuite().start()
-    if "content" in sys.argv or len(sys.argv)<3:
-        ContentBasedSuite().start()
-    #if "collaborative" in sys.argv or len(sys.argv)<3:
-    #CollaborativeSuite().start()
@@ -0,0 +1,199 @@
+#!/usr/bin/env python
+"""
+    profile-suite - experiment different profile sizes
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import sys
+sys.path.insert(0,'../')
+from config import Config
+from data import PopconXapianIndex, PopconSubmission
+from recommender import Recommender
+from user import LocalSystem, User
+from evaluation import *
+import logging
+import random
+import Gnuplot
+import numpy
+
+if __name__ == '__main__':
+    # strategy_category (argv[1]) and sample_file (argv[2]) are both read
+    # below; print the usage message instead of failing with an IndexError.
+    if len(sys.argv)<3:
+        print "Usage: pure strategy_category sample_file"
+        sys.exit(1)
+
+    iterations = 20
+    profile_size = [10,20,40,60,80,100,140,170,200,240]
+    neighbor_size = [3,5,10,20,30,50,70,100,150,200]
+
+    content_strategies = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
+    collaborative_strategies = ['knn_eset','knn','knn_plus']
+
+    #iterations = 1
+    #profile_size = [10,20,30]
+    #neighbor_size = [3,5,10,20,30,50]
+    #content_strategies = ['cb']
+    #collaborative_strategies = ['knn']
+
+    # Select the strategies to run and the parameter that is varied for them:
+    # profile sizes for "content", neighborhood sizes for "collaborative".
+    # option_str names that parameter in log file names, file headers and
+    # the x axis label of the plots.
+    strategy_category = sys.argv[1]
+    if strategy_category == "content":
+        strategies = content_strategies
+        sizes = profile_size
+        option_str = "profile"
+    elif strategy_category == "collaborative":
+        strategies = collaborative_strategies
+        sizes = neighbor_size
+        option_str = "neighborhood"
+    else:
+        # Any other category is rejected with the usage message
+        print "Usage: profile-suite strategy_category sample_file"
+        exit(1)
+
+    cfg = Config()
+    population_sample = []
+    sample_file = sys.argv[2]
+    sample_str = sample_file.split('/')[-1]
+    with open(sample_file,'r') as f:
+        for line in f.readlines():
+            user_id = line.strip('\n')
+            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
+    sample_dir = ("results/%s/%s" %
+                  (strategy_category,sample_str))
+    if not os.path.exists(sample_dir):
+        os.makedirs(sample_dir)
+
+    for strategy in strategies:
+        cfg.strategy = strategy
+        p_10_summary = {}
+        f05_100_summary = {}
+        c_10 = {}
+        c_100 = {}
+
+        log_file = os.path.join(sample_dir,sample_str+"-"+cfg.strategy)
+        graph_10 = log_file+"-10.png"
+        graph_100 = log_file+"-100.png"
+        # str.strip(".png") removes any of the characters '.', 'p', 'n', 'g'
+        # from BOTH ends of the string rather than the ".png" suffix, so the
+        # extension is sliced off instead (both names always end in ".png").
+        graph_10_jpg = graph_10[:-len(".png")]+".jpg"
+        graph_100_jpg = graph_100[:-len(".png")]+".jpg"
+        comment_10 = graph_10_jpg+".comment"
+        comment_100 = graph_100_jpg+".comment"
+
+        with open(comment_10,'w') as f:
+            f.write("# sample %s\n" % sample_str)
+            f.write("# strategy %s\n# threshold 10\n# iterations %d\n\n" %
+                    (cfg.strategy,iterations))
+            f.write("# %s\tmean_p_10\tdev_p_10\tc_10\n\n"%option_str)
+        with open(comment_100,'w') as f:
+            f.write("# sample %s\n" % sample_str)
+            f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
+                    (cfg.strategy,iterations))
+            f.write("# %s\t\tmean_f05_100\t\tdev_f05_100\t\tc_100\n\n"%option_str)
+
+        for size in sizes:
+            c_10[size] = set()
+            c_100[size] = set()
+            p_10_summary[size] = []
+            f05_100_summary[size] = []
+            with open(log_file+"-%s%.3d"%(option_str,size),'w') as f:
+                f.write("# sample %s\n" % sample_str)
+                f.write("# strategy %s-%s%.3d\n\n" % (cfg.strategy,option_str,size))
+                f.write("# p_10\tf05_100\n\n")
+
+        # main loop per user
+        for submission_file in population_sample:
+            user = PopconSystem(submission_file)
+            user.filter_pkg_profile(cfg.pkgs_filter)
+            user.maximal_pkg_profile()
+            for size in sizes:
+                cfg.profile_size = size
+                cfg.k_neighbors = size
+                rec = Recommender(cfg)
+                repo_size = rec.items_repository.get_doccount()
+                p_10 = []
+                f05_100 = []
+                for n in range(iterations):
+                    # Fill sample profile
+                    profile_len = len(user.pkg_profile)
+                    item_score = {}
+                    for pkg in user.pkg_profile:
+                        item_score[pkg] = user.item_score[pkg]
+                    sample = {}
+                    sample_size = int(profile_len*0.9)
+                    for i in range(sample_size):
+                         key = random.choice(item_score.keys())
+                         sample[key] = item_score.pop(key)
+                    iteration_user = User(item_score)
+                    recommendation = rec.get_recommendation(iteration_user,repo_size)
+                    if hasattr(recommendation,"ranking"):
+                        ranking = recommendation.ranking
+                        real = RecommendationResult(sample)
+                        predicted_10 = RecommendationResult(dict.fromkeys(ranking[:10],1))
+                        evaluation = Evaluation(predicted_10,real,repo_size)
+                        p_10.append(evaluation.run(Precision()))
+                        predicted_100 = RecommendationResult(dict.fromkeys(ranking[:100],1))
+                        evaluation = Evaluation(predicted_100,real,repo_size)
+                        f05_100.append(evaluation.run(F_score(0.5)))
+                        c_10[size] = c_10[size].union(recommendation.ranking[:10])
+                        c_100[size] = c_100[size].union(recommendation.ranking[:100])
+                # save summary
+                if p_10:
+                    p_10_summary[size].append(numpy.mean(p_10))
+                if f05_100:
+                    f05_100_summary[size].append(numpy.mean(f05_100))
+
+                with open(log_file+"-%s%.3d"%(option_str,size),'a') as f:
+                    f.write("%.4f \t%.4f\n" % (numpy.mean(p_10),numpy.mean(f05_100)))
+
+        # back to main flow
+        coverage_10 = {}
+        coverage_100 = {}
+        with open(comment_10,'a') as f:
+            for size in sizes:
+                coverage_10[size] = len(c_10[size])/float(repo_size)
+                f.write("%3d\t\t%.4f\t\t%.4f\t\t%.4f\n" %
+                        (size,numpy.mean(p_10_summary[size]),numpy.std(p_10_summary[size]),coverage_10[size]))
+        with open(comment_100,'a') as f:
+            for size in sizes:
+                coverage_100[size] = len(c_100[size])/float(repo_size)
+                f.write("%3d\t\t%.4f\t\t%.4f\t\t%.4f\n" %
+                        (size,numpy.mean(f05_100_summary[size]),numpy.std(f05_100_summary[size]),coverage_100[size]))
+
+        # plot results summary
+        g = Gnuplot.Gnuplot()
+        g('set style data lines')
+        g('set yrange [0:1.0]')
+        g.xlabel('%s size'%option_str.capitalize())
+        g.title("Setup: %s (threshold 10)" % cfg.strategy)
+        g.plot(Gnuplot.Data(sorted([[k,numpy.mean(p_10_summary[k]),numpy.std(p_10_summary[k])]
+                                    for k in p_10_summary.keys()]),title="Precision"),
+               Gnuplot.Data(sorted([[k,numpy.mean(p_10_summary[k]),numpy.std(p_10_summary[k])]
+                                    for k in p_10_summary.keys()]),title="Deviation",
+                                    with_="yerrorbar lt 2 pt 6"),
+               Gnuplot.Data(sorted([[k,coverage_10[k]]
+                                    for k in coverage_10.keys()]),title="Coverage"))
+        g.hardcopy(graph_10,terminal="png")
+        g = Gnuplot.Gnuplot()
+        g('set style data lines')
+        g('set yrange [0:1.0]')
+        g.xlabel('%s size'%option_str.capitalize())
+        g.title("Setup: %s (threshold 100)" % cfg.strategy)
+        g.plot(Gnuplot.Data(sorted([[k,numpy.mean(f05_100_summary[k]),numpy.std(f05_100_summary[k])]
+                                    for k in f05_100_summary.keys()]),title="F05"),
+               Gnuplot.Data(sorted([[k,numpy.mean(f05_100_summary[k]),numpy.std(f05_100_summary[k])]
+                                    for k in f05_100_summary.keys()]),title="Deviation",
+                                    with_="yerrorbar lt 2 pt 6"),
+               Gnuplot.Data(sorted([[k,coverage_100[k]]
+                                    for k in coverage_100.keys()]),title="Coverage"))
+        g.hardcopy(graph_100,terminal="png")
@@ -0,0 +1,240 @@
+#!/usr/bin/env python
+"""
+    recommender suite - recommender experiments suite 
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import sys
+sys.path.insert(0,'../')
+from config import Config
+from data import PopconXapianIndex, PopconSubmission
+from recommender import Recommender
+from user import LocalSystem, User
+from evaluation import *
+import logging
+import random
+import Gnuplot
+import numpy
+import shutil
+
+def plot_roc(results,log_file,mean=0):
+    g = Gnuplot.Gnuplot()
+    g('set style data lines')
+    g.xlabel('False Positive Rate')
+    g.ylabel('True Positive Rate')
+    g('set xrange [0:1.0]')
+    g('set yrange [0:1.0]')
+    g.title("Setup: %s" % log_file.split("/")[-1])
+    g('set label "C %.4f" at 0.68,0.2' % results.coverage())
+    g('set label "AUC %.4f" at 0.68,0.15' % results.get_auc())
+    g('set label "P(10) %.2f +- %.2f" at 0.68,0.10' % (numpy.mean(results.precision[10]),numpy.std(results.precision[10])))
+    g('set label "F05(100) %.2f +- %.2f" at 0.68,0.05' % (numpy.mean(results.f05[100]),numpy.std(results.f05[100])))
+    if mean==1:
+        g.plot(Gnuplot.Data(results.get_roc_points(),title="mean ROC"),
+               Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"))
+        g.hardcopy(log_file+"-roc-mean.png",terminal="png")
+        g.hardcopy(log_file+"-roc-mean.ps",terminal="postscript",enhanced=1,color=1)
+    else:
+        g.plot(Gnuplot.Data(results.get_roc_points(),title="ROC",with_="xyerrorbars"),
+               Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"))
+        g.hardcopy(log_file+"-roc.png",terminal="png")
+        g.hardcopy(log_file+"-roc.ps",terminal="postscript",enhanced=1,color=1)
+
+def get_label(cfg):
+    label = {}
+    if cfg.strategy in content_based:
+        label["description"] = "strategy-profile"
+        label["values"] = ("%s-profile%.3d" %
+                           (cfg.strategy,cfg.profile_size))
+    elif cfg.strategy in collaborative:
+       label["description"] = "strategy-knn"
+       label["values"] = ("%s-k%.3d" %
+                          (cfg.strategy,cfg.k_neighbors))
+    elif cfg.strategy in hybrid:
+       label["description"] = "strategy-knn-profile"
+       label["values"] = ("%s-k%.3d-profile%.3d" %
+                          (cfg.strategy,cfg.k_neighbors,cfg.profile_size))
+    return label
+
+class ExperimentResults:
+    def __init__(self,repo_size):
+        self.repository_size = repo_size
+        self.precision = {}
+        self.recall = {}
+        self.fpr = {}
+        self.f05 = {}
+        self.recommended = {}
+        self.thresholds = [1]+range(10,self.repository_size,10)
+        for size in self.thresholds:
+            self.precision[size] = []
+            self.recall[size] = []
+            self.fpr[size] = []
+            self.f05[size] = []
+            self.recommended[size] = set()
+
+    def add_result(self,ranking,sample):
+        for size in self.thresholds:
+            recommendation = ranking[:size]
+            self.recommended[size] = self.recommended[size].union(recommendation)
+            predicted = RecommendationResult(dict.fromkeys(recommendation,1))
+            real = RecommendationResult(sample)
+            evaluation = Evaluation(predicted,real,self.repository_size)
+            self.precision[size].append(evaluation.run(Precision()))
+            self.recall[size].append(evaluation.run(Recall()))
+            self.f05[size].append(evaluation.run(F_score(0.5)))
+            self.fpr[size].append(evaluation.run(FPR()))
+
+    def precision_summary(self):
+        return [[size,numpy.mean(self.precision[size])] for size in self.thresholds]
+
+    def recall_summary(self):
+        return [[size,numpy.mean(self.recall[size])] for size in self.thresholds]
+
+    def f05_summary(self):
+        return [[size,numpy.mean(self.f05[size])] for size in self.thresholds]
+
+    def coverage_summary(self):
+        return [[size,self.coverage(size)] for size in self.thresholds]
+
+    def coverage(self,size=0):
+        if not size:
+            size = self.thresholds[-1]
+        return len(self.recommended[size])/float(self.repository_size)
+
+    def precision(self,size):
+        return numpy.mean(results.precision[size])
+
+    def get_auc(self):
+        roc_points = self.get_roc_points()
+        x_roc = [p[0] for p in roc_points]
+        y_roc = [p[1] for p in roc_points]
+        x_roc.insert(0,0)
+        y_roc.insert(0,0)
+        x_roc.append(1)
+        y_roc.append(1)
+        return numpy.trapz(y=y_roc, x=x_roc)
+
+    # Average ROC by threshold (= size of recommendation)
+    def get_roc_points(self):
+        points = []
+        for size in self.recall.keys():
+            tpr = self.recall[size]
+            fpr = self.fpr[size]
+            points.append([numpy.mean(fpr),numpy.mean(tpr),numpy.std(fpr),numpy.std(tpr)])
+        return sorted(points)
+
+def run_strategy(cfg,sample_file):
+    rec = Recommender(cfg)
+    repo_size = rec.items_repository.get_doccount()
+    results = ExperimentResults(repo_size)
+    label = get_label(cfg)
+    population_sample = []
+    sample_str = sample_file.split('/')[-1]
+    with open(sample_file,'r') as f:
+        for line in f.readlines():
+            user_id = line.strip('\n')
+            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
+    sample_dir = ("results/roc-sample/%s" % sample_str)
+    if not os.path.exists(sample_dir):
+        os.makedirs(sample_dir)
+    log_file = os.path.join(sample_dir,label["values"])
+
+    # n iterations per population user
+    for submission_file in population_sample:
+        user = PopconSystem(submission_file)
+        user.filter_pkg_profile(cfg.pkgs_filter)
+        user.maximal_pkg_profile()
+        for n in range(iterations):
+            # Fill sample profile
+            profile_len = len(user.pkg_profile)
+            item_score = {}
+            for pkg in user.pkg_profile:
+                item_score[pkg] = user.item_score[pkg]
+            sample = {}
+            sample_size = int(profile_len*0.9)
+            for i in range(sample_size):
+                 key = random.choice(item_score.keys())
+                 sample[key] = item_score.pop(key)
+            iteration_user = User(item_score)
+            recommendation = rec.get_recommendation(iteration_user,repo_size)
+            if hasattr(recommendation,"ranking"):
+                results.add_result(recommendation.ranking,sample)
+
+    plot_roc(results,log_file)
+    plot_roc(results,log_file,1)
+    with open(log_file+"-roc.jpg.comment",'w') as f:
+        f.write("# %s\n# %s\n\n" %
+                (label["description"],label["values"]))
+        f.write("# roc AUC\n%.4f\n\n"%results.get_auc())
+        f.write("# threshold\tmean_fpr\tdev_fpr\t\tmean_tpr\tdev_tpr\t\tcoverage\n")
+        for size in results.thresholds:
+            f.write("%4d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\n" %
+                    (size,numpy.mean(results.fpr[size]),
+                     numpy.std(results.fpr[size]),
+                     numpy.mean(results.recall[size]),
+                     numpy.std(results.recall[size]),
+                     numpy.mean(results.coverage(size))))
+
+def run_content(cfg,sample_file):
+    for size in profile_size:
+        cfg.profile_size = size
+        run_strategy(cfg,sample_file)
+
+def run_collaborative(cfg,sample_file):
+    for k in neighbors:
+        cfg.k_neighbors = k
+        run_strategy(cfg,sample_file)
+
+def run_hybrid(cfg,sample_file):
+    for k in neighbors:
+        cfg.k_neighbors = k
+        for size in profile_size:
+            cfg.profile_size = size
+            run_strategy(cfg,sample_file)
+
+if __name__ == '__main__':
+    if len(sys.argv)<2:
+        print "Usage: sample-roc strategy_str [popcon_sample_path]"
+        exit(1)
+
+    #iterations = 3
+    #content_based = ['cb']
+    #collaborative = ['knn_eset']
+    #hybrid = ['knnco']
+    #profile_size = [50,100]
+    #neighbors = [50]
+    iterations = 20
+    content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
+    collaborative = ['knn_eset','knn','knn_plus']
+    hybrid = ['knnco','knnco_eset']
+    profile_size = [10,20,50,100,200]
+    neighbors = [200]
+    #neighbors = [3,10,50,100,200]
+    #profile_size = [10,20,40,60,80,100,140,170,200,240]
+    #neighbors = [3,5,10,20,30,50,70,100,150,200]
+    
+    cfg = Config()
+    cfg.strategy = sys.argv[1]
+    sample_file = sys.argv[2]
+
+    if cfg.strategy in content_based:
+        run_content(cfg,sample_file)
+    if cfg.strategy in collaborative:
+        run_collaborative(cfg,sample_file)
+    if cfg.strategy in hybrid:
+        run_hybrid(cfg,sample_file)
@@ -0,0 +1,269 @@
+#!/usr/bin/env python
+"""
+    recommender suite - recommender experiments suite 
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import sys
+sys.path.insert(0,'../')
+from config import Config
+from data import PopconXapianIndex, PopconSubmission
+from recommender import Recommender
+from user import LocalSystem, User
+from evaluation import *
+import logging
+import random
+import Gnuplot
+import numpy
+import shutil
+
+def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
+    # Write recall log
+    output = open(("%s-%.2d" % (log_file,n)),'w')
+    output.write("# %s-n\n" % label["description"])
+    output.write("# %s-%.2d\n" % (label["values"],n))
+    output.write("\n# repository profile sample\n%d %d %d\n" % \
+                 (repo_size,profile_size,len(sample)))
+    if hasattr(recommendation,"ranking"):
+        notfound = []
+        ranks = []
+        for pkg in sample.keys():
+            if pkg in recommendation.ranking:
+                ranks.append(recommendation.ranking.index(pkg))
+            else:
+                notfound.append(pkg)
+        for r in sorted(ranks):
+            output.write(str(r)+"\n")
+        if notfound:
+            output.write("# out of recommendation:\n")
+            for pkg in notfound:
+                output.write(pkg+"\n")
+    output.close()
+
+def plot_summary(results,log_file):
+    # Plot metrics summary
+    g = Gnuplot.Gnuplot()
+    g('set style data lines')
+    g('set yrange [0:1.0]')
+    g.xlabel('Threshold (recommendation size)')
+    g.title("Setup: %s" % log_file.split("/")[-1])
+    g.plot(Gnuplot.Data(results.precision_summary(),title="Precision"),
+           Gnuplot.Data(results.recall_summary(),title="Recall"),
+           Gnuplot.Data(results.f05_summary(),title="F05"),
+           Gnuplot.Data(results.coverage_summary(),title="Coverage"))
+    g.hardcopy(log_file+".png",terminal="png")
+    g.hardcopy(log_file+".ps",terminal="postscript",enhanced=1,color=1)
+    g('set logscale x')
+    g('replot')
+    g.hardcopy(log_file+"-logscale.png",terminal="png")
+    g.hardcopy(log_file+"-logscale.ps",terminal="postscript",enhanced=1,color=1)
+
+def plot_roc(results,log_file):
+    g = Gnuplot.Gnuplot()
+    g('set style data lines')
+    g.xlabel('False Positive Rate')
+    g.ylabel('True Positive Rate')
+    g('set xrange [0:1.0]')
+    g('set yrange [0:1.0]')
+    g.title("Setup: %s" % log_file.split("/")[-1])
+    g('set label "C %.2f" at 0.8,0.25' % results.coverage())
+    g('set label "AUC %.2f" at 0.8,0.2' % results.get_auc())
+    g('set label "P(10) %.2f" at 0.8,0.15' % numpy.mean(results.precision[10]))
+    g('set label "P(20) %.2f" at 0.8,0.10' % numpy.mean(results.precision[20]))
+    g('set label "F05(100) %.2f" at 0.8,0.05' % numpy.mean(results.f05[100]))
+    g.plot(Gnuplot.Data(results.get_roc_points(),title="ROC"),
+           Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"))
+           #Gnuplot.Data([roc_points[-1],[1,1]],with_="lines lt 6"))
+    g.hardcopy(log_file+"-roc.png",terminal="png")
+    g.hardcopy(log_file+"-roc.ps",terminal="postscript",enhanced=1,color=1)
+
+def get_label(cfg):
+    label = {}
+    if cfg.strategy in content_based:
+        label["description"] = "strategy-profile"
+        label["values"] = ("%s-profile%.3d" %
+                           (cfg.strategy,cfg.profile_size))
+    elif cfg.strategy in collaborative:
+       label["description"] = "strategy-knn"
+       label["values"] = ("%s-k%.3d" %
+                          (cfg.strategy,cfg.k_neighbors))
+    elif cfg.strategy in hybrid:
+       label["description"] = "strategy-knn-profile"
+       label["values"] = ("%s-k%.3d-profile%.3d" %
+                          (cfg.strategy,cfg.k_neighbors,cfg.profile_size))
+    return label
+
+class ExperimentResults:
+    def __init__(self,repo_size):
+        self.repository_size = repo_size
+        self.precision = {}
+        self.recall = {}
+        self.fpr = {}
+        self.f05 = {}
+        self.recommended = {}
+        self.thresholds = [1]+range(10,self.repository_size,10)
+        for size in self.thresholds:
+            self.precision[size] = []
+            self.recall[size] = []
+            self.fpr[size] = []
+            self.f05[size] = []
+            self.recommended[size] = set()
+
+    def add_result(self,ranking,sample):
+        for size in self.thresholds:
+            recommendation = ranking[:size]
+            self.recommended[size] = self.recommended[size].union(recommendation)
+            predicted = RecommendationResult(dict.fromkeys(recommendation,1))
+            real = RecommendationResult(sample)
+            evaluation = Evaluation(predicted,real,self.repository_size)
+            print evaluation.run(Precision())
+            self.precision[size].append(evaluation.run(Precision()))
+            self.recall[size].append(evaluation.run(Recall()))
+            self.f05[size].append(evaluation.run(F_score(0.5)))
+            self.fpr[size].append(evaluation.run(FPR()))
+
+    def precision_summary(self):
+        return [[size,numpy.mean(self.precision[size])] for size in self.thresholds]
+
+    def recall_summary(self):
+        return [[size,numpy.mean(self.recall[size])] for size in self.thresholds]
+
+    def f05_summary(self):
+        return [[size,numpy.mean(self.f05[size])] for size in self.thresholds]
+
+    def coverage_summary(self):
+        return [[size,self.coverage(size)] for size in self.thresholds]
+
+    def coverage(self,size=0):
+        if not size:
+            size = self.thresholds[-1]
+        return len(self.recommended[size])/float(self.repository_size)
+
+    def precision(self,size):
+        return numpy.mean(results.precision[size])
+
+    def get_auc(self):
+        roc_points = self.get_roc_points()
+        x_roc = [p[0] for p in roc_points]
+        y_roc = [p[1] for p in roc_points]
+        x_roc.insert(0,0)
+        y_roc.insert(0,0)
+        x_roc.append(1)
+        y_roc.append(1)
+        return numpy.trapz(y=y_roc, x=x_roc)
+
+    # Average ROC by threshold (= size of recommendation)
+    def get_roc_points(self):
+        points = []
+        for size in self.recall.keys():
+            tpr = self.recall[size]
+            fpr = self.fpr[size]
+            points.append([sum(fpr)/len(fpr),sum(tpr)/len(tpr)])
+        return sorted(points)
+
+def run_strategy(cfg,user):
+    rec = Recommender(cfg)
+    repo_size = rec.items_repository.get_doccount()
+    results = ExperimentResults(repo_size)
+    label = get_label(cfg)
+    user_dir = ("results/roc-suite/%s/%s" % (user.user_id[:8],cfg.strategy))
+    if not os.path.exists(user_dir):
+        os.makedirs(user_dir)
+    log_file = os.path.join(user_dir,label["values"])
+    for n in range(iterations):
+        # Fill sample profile
+        profile_len = len(user.pkg_profile)
+        item_score = {}
+        for pkg in user.pkg_profile:
+            item_score[pkg] = user.item_score[pkg]
+        sample = {}
+        sample_size = int(profile_len*0.9)
+        for i in range(sample_size):
+             key = random.choice(item_score.keys())
+             sample[key] = item_score.pop(key)
+        iteration_user = User(item_score)
+        recommendation = rec.get_recommendation(iteration_user,repo_size)
+        write_recall_log(label,n,sample,recommendation,profile_len,repo_size,log_file)
+        if hasattr(recommendation,"ranking"):
+            results.add_result(recommendation.ranking,sample)
+    with open(log_file+"-roc.jpg.comment",'w') as f:
+        f.write("# %s\n# %s\n\n" %
+                (label["description"],label["values"]))
+        f.write("# roc AUC\n%.4f\n\n"%results.get_auc())
+        f.write("# threshold\tprecision\trecall\t\tf05\t\tcoverage\n")
+        for size in results.thresholds:
+            f.write("%4d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\n" %
+                    (size,numpy.mean(results.precision[size]),
+                     numpy.mean(results.recall[size]),
+                     numpy.mean(results.f05[size]),
+                     numpy.mean(results.coverage(size))))
+    shutil.copy(log_file+"-roc.jpg.comment",log_file+".jpg.comment")
+    shutil.copy(log_file+"-roc.jpg.comment",log_file+"-logscale.jpg.comment")
+    plot_roc(results,log_file)
+    plot_summary(results,log_file)
+
+def run_content(user,cfg):
+    for size in profile_size:
+        cfg.profile_size = size
+        run_strategy(cfg,user)
+
+def run_collaborative(user,cfg):
+    for k in neighbors:
+        cfg.k_neighbors = k
+        run_strategy(cfg,user)
+
+def run_hybrid(user,cfg):
+    for k in neighbors:
+        cfg.k_neighbors = k
+        for size in profile_size:
+            cfg.profile_size = size
+            run_strategy(cfg,user)
+
+if __name__ == '__main__':
+    if len(sys.argv)<2:
+        print "Usage: roc-suite strategy_str [popcon_submission_path]"
+        exit(1)
+
+    #iterations = 3
+    #content_based = ['cb']
+    #collaborative = ['knn_eset']
+    #hybrid = ['knnco']
+    #profile_size = [50,100]
+    #neighbors = [50]
+    iterations = 20
+    content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
+    collaborative = ['knn_eset','knn','knn_plus']
+    hybrid = ['knnco','knnco_eset']
+    profile_size = [10,20,40,60,80,100,140,170,200,240]
+    neighbors = [3,5,10,20,30,50,70,100,150,200]
+    
+    cfg = Config()
+    cfg.strategy = sys.argv[1]
+
+    #user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
+    user = PopconSystem("/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7")
+    #user = PopconSystem(sys.argv[1])
+    user.filter_pkg_profile(cfg.pkgs_filter)
+    user.maximal_pkg_profile()
+
+    if cfg.strategy in content_based:
+        run_content(user,cfg)
+    if cfg.strategy in collaborative:
+        run_collaborative(user,cfg)
+    if cfg.strategy in hybrid:
+        run_hybrid(user,cfg)
@@ -1,328 +0,0 @@
-#!/usr/bin/env python
-"""
-    recommender suite - recommender experiments suite 
-"""
-__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
-__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
-__license__ = """
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-"""
-
-import sys
-sys.path.insert(0,'../')
-from config import Config
-from data import PopconXapianIndex, PopconSubmission
-from recommender import Recommender
-from user import LocalSystem, User
-from evaluation import *
-import logging
-import random
-import Gnuplot
-import numpy
-
-#iterations = 3
-#sample_proportions = [0.9]
-#weighting = [('bm25',1.2)]
-#collaborative = ['knn_eset']
-#content_based = ['cb']
-#hybrid = ['knnco']
-#profile_size = [50,100]
-#popcon_size = ["1000"]
-#neighbors = [50]
-
-iterations = 30
-sample_proportions = [0.9]
-weighting = [('bm25',1.0),('bm25',1.2),('bm25',2.0),('trad',0)]
-content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
-collaborative = ['knn_eset','knn','knn_plus']
-hybrid = ['knnco','knnco_eset']
-profile_size = range(20,200,20)
-neighbors = range(10,510,50)
-
-def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
-    # Write recall log
-    output = open(("%s-%.2d" % (log_file,n)),'w')
-    output.write("# %s-n\n" % label["description"])
-    output.write("# %s-%.2d\n" % (label["values"],n))
-    output.write("\n# repository profile sample\n%d %d %d\n" % \
-                 (repo_size,profile_size,len(sample)))
-    if hasattr(recommendation,"ranking"):
-        notfound = []
-        ranks = []
-        for pkg in sample.keys():
-            if pkg in recommendation.ranking:
-                ranks.append(recommendation.ranking.index(pkg))
-            else:
-                notfound.append(pkg)
-        for r in sorted(ranks):
-            output.write(str(r)+"\n")
-        if notfound:
-            output.write("# out of recommendation:\n")
-            for pkg in notfound:
-                output.write(pkg+"\n")
-    output.close()
-
-def plot_roc(roc_points,auc,eauc,c,p,log_file):
-    g = Gnuplot.Gnuplot()
-    g('set style data lines')
-    g.xlabel('False Positive Rate')
-    g.ylabel('True Positive Rate')
-    g('set xrange [0:1.0]')
-    g('set yrange [0:1.0]')
-    g.title("Setup: %s" % log_file.split("/")[-1])
-    g('set label "C %.2f" at 0.8,0.25' % c)
-    g('set label "P(20) %.2f" at 0.8,0.2' % p)
-    g('set label "AUC %.4f" at 0.8,0.15' % auc)
-    g('set label "EAUC %.4f" at 0.8,0.1' % eauc)
-    g.plot(Gnuplot.Data(roc_points,title="ROC"),
-           Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"),
-           Gnuplot.Data([roc_points[-1],[1,1]],with_="lines lt 6"))
-    g.hardcopy(log_file+"-roc.png",terminal="png")
-    g.hardcopy(log_file+"-roc.ps",terminal="postscript",enhanced=1,color=1)
-
-def plot_summary(precision,recall,f1,f05,accuracy,log_file):
-    # Plot metrics summary
-    g = Gnuplot.Gnuplot()
-    g('set style data lines')
-    g.xlabel('Recommendation size')
-    g.title("Setup: %s" % log_file.split("/")[-1])
-    g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
-           Gnuplot.Data(precision,title="Precision"),
-           Gnuplot.Data(recall,title="Recall"),
-           Gnuplot.Data(f1,title="F_1"),
-           Gnuplot.Data(f05,title="F_0.5"))
-    g.hardcopy(log_file+".png",terminal="png")
-    g.hardcopy(log_file+".ps",terminal="postscript",enhanced=1,color=1)
-    g('set logscale x')
-    g('replot')
-    g.hardcopy(log_file+"-logscale.png",terminal="png")
-    g.hardcopy(log_file+"-logscale.ps",terminal="postscript",enhanced=1,color=1)
-
-def get_label(cfg,sample_proportion):
-    label = {}
-    if cfg.strategy in content_based:
-        label["description"] = "strategy-filter-profile-k1_bm25"
-        label["values"] = ("%s-profile%.3d-%s-kbm%.1f" %
-                           (cfg.strategy,cfg.profile_size,
-                            cfg.pkgs_filter.split("/")[-1],
-                            cfg.bm25_k1))
-    elif cfg.strategy in collaborative:
-       label["description"] = "strategy-knn-filter-k1_bm25"
-       label["values"] = ("%s-k%.3d-%s-kbm%.1f" %
-                          (cfg.strategy,cfg.k_neighbors,
-                           cfg.pkgs_filter.split("/")[-1],
-                           cfg.bm25_k1))
-    elif cfg.strategy in hybrid:
-       label["description"] = "strategy-knn-filter-profile-k1_bm25"
-       label["values"] = ("%s-k%.3d-profile%.3d-%s-kbm%.1f" %
-                          (cfg.strategy,cfg.k_neighbors,cfg.profile_size,
-                           cfg.pkgs_filter.split("/")[-1],
-                           cfg.bm25_k1))
-    else:
-        print "Unknown strategy"
-    return label
-
-class ExperimentResults:
-    def __init__(self,repo_size):
-        self.repository_size = repo_size
-        self.accuracy = {}
-        self.precision = {}
-        self.recall = {}
-        self.f1 = {}
-        self.f05 = {}
-        self.fpr = {}
-        #points = [1]+range(10,200,10)+range(200,self.repository_size,100)
-        points = [1]+range(10,self.repository_size,10)
-        self.recommended = set()
-        for size in points:
-            self.accuracy[size] = []
-            self.precision[size] = []
-            self.recall[size] = []
-            self.f1[size] = []
-            self.f05[size] = []
-            self.fpr[size] = []
-
-    def add_result(self,ranking,sample):
-        print "len_recommended", len(self.recommended)
-        print "len_rank", len(ranking)
-        self.recommended = self.recommended.union(ranking)
-        print "len_recommended", len(self.recommended)
-        # get data only for point
-        for size in self.accuracy.keys():
-            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
-            real = RecommendationResult(sample)
-            evaluation = Evaluation(predicted,real,self.repository_size)
-            #self.accuracy[size].append(evaluation.run(Accuracy()))
-            self.precision[size].append(evaluation.run(Precision()))
-            self.recall[size].append(evaluation.run(Recall()))
-            #self.f1[size].append(evaluation.run(F_score(1)))
-            #self.f05[size].append(evaluation.run(F_score(0.5)))
-            self.fpr[size].append(evaluation.run(FPR()))
-
-    # Average ROC by threshold (whici is the size)
-    def get_roc_points(self):
-        points = []
-        for size in self.recall.keys():
-            tpr = self.recall[size]
-            fpr = self.fpr[size]
-            points.append([sum(fpr)/len(fpr),sum(tpr)/len(tpr)])
-        return sorted(points)
-
-    def get_precision_summary(self):
-        summary = [[size,sum(values)/len(values)] for size,values in self.precision.items()]
-        return sorted(summary)
-
-    def get_recall_summary(self):
-        summary = [[size,sum(values)/len(values)] for size,values in self.recall.items()]
-        return sorted(summary)
-
-    def get_f1_summary(self):
-        summary = [[size,sum(values)/len(values)] for size,values in self.f1.items()]
-        return sorted(summary)
-
-    def get_f05_summary(self):
-        summary = [[size,sum(values)/len(values)] for size,values in self.f05.items()]
-        return sorted(summary)
-
-    def get_accuracy_summary(self):
-        summary = [[size,sum(values)/len(values)] for size,values in self.accuracy.items()]
-        return sorted(summary)
-
-    def best_precision(self):
-        size = max(self.precision, key = lambda x: max(self.precision[x]) and x>10)
-        return (size,max(self.precision[size]))
-
-    def best_f1(self):
-        size = max(self.f1, key = lambda x: max(self.f1[x]))
-        return (size,max(self.f1[size]))
-
-    def best_f05(self):
-        size = max(self.f05, key = lambda x: max(self.f05[x]))
-        return (size,max(self.f05[size]))
-
-def run_strategy(cfg,user):
-    for weight in weighting:
-        cfg.weight = weight[0]
-        cfg.bm25_k1 = weight[1]
-        rec = Recommender(cfg)
-        repo_size = rec.items_repository.get_doccount()
-        for proportion in sample_proportions:
-            results = ExperimentResults(repo_size)
-            label = get_label(cfg,proportion)
-            #log_file = "results/20110906/4a67a295/"+label["values"]
-            log_file = "results/"+label["values"]
-            for n in range(iterations):
-                # Fill sample profile
-                profile_size = len(user.pkg_profile)
-                item_score = {}
-                for pkg in user.pkg_profile:
-                    item_score[pkg] = user.item_score[pkg]
-                sample = {}
-                sample_size = int(profile_size*proportion)
-                for i in range(sample_size):
-                     key = random.choice(item_score.keys())
-                     sample[key] = item_score.pop(key)
-                iteration_user = User(item_score)
-                recommendation = rec.get_recommendation(iteration_user,repo_size)
-                #write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file)
-                if hasattr(recommendation,"ranking"):
-                    results.add_result(recommendation.ranking,sample)
-            with open(log_file,'w') as f:
-                roc_points = results.get_roc_points()
-                x_coord = [p[0] for p in roc_points]
-                y_coord = [p[1] for p in roc_points]
-                auc = numpy.trapz(y=y_coord, x=x_coord)
-                eauc = (auc+
-                        numpy.trapz(y=[0,roc_points[0][1]],x=[0,roc_points[0][0]])+
-                        numpy.trapz(y=[roc_points[-1][1],1],x=[roc_points[-1][0],1]))
-                precision_20 = sum(results.precision[10])/len(results.precision[10])
-                print results.recommended
-                print "len",len(results.recommended)
-                coverage = len(results.recommended)/float(repo_size)
-                print "repo_size: ", float(repo_size)
-                print coverage
-                exit(1)
-                #f1_10 = sum(results.f1[10])/len(results.f1[10])
-                #f05_10 = sum(results.f05[10])/len(results.f05[10])
-                f.write("# %s\n# %s\n\n" %
-                        (label["description"],label["values"]))
-                f.write("# coverage \tp(20) \tauc \teauc\n\t%.2f \t%.2f \t%.4f \t%.4f\n\n" %
-                        (coverage,precision_20,auc,eauc))
-                #f.write("# best results (recommendation size; metric)\n")
-                #f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\nf05 (%d; %.2f)\n\n" %
-                #        (results.best_precision()[0],results.best_precision()[1],
-                #         results.best_f1()[0],results.best_f1()[1],
-                #         results.best_f05()[0],results.best_f05()[1]))
-                #f.write("# recommendation size 10\nprecision (10; %.2f)\nf1 (10; %.2f)\nf05 (10; %.2f)" %
-                #        (precision_10,f1_10,f05_10))
-            #precision = results.get_precision_summary()
-            #recall = results.get_recall_summary()
-            #f1 = results.get_f1_summary()
-            #f05 = results.get_f05_summary()
-            #accuracy = results.get_accuracy_summary()
-            #plot_summary(precision,recall,f1,f05,accuracy,log_file)
-            plot_roc(roc_points,auc,eauc,coverage,precision_20,log_file)
-
-def run_content(user,cfg):
-    for strategy in content_based:
-        cfg.strategy = strategy
-        for size in profile_size:
-            cfg.profile_size = size
-            run_strategy(cfg,user)
-
-def run_collaborative(user,cfg):
-    popcon_desktopapps = cfg.popcon_desktopapps
-    popcon_programs = cfg.popcon_programs
-    for strategy in collaborative:
-        cfg.strategy = strategy
-        for k in neighbors:
-            cfg.k_neighbors = k
-            #for size in popcon_size:
-            #    if size:
-            #        cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
-            #        cfg.popcon_programs = popcon_programs+"_"+size
-            run_strategy(cfg,user)
-
-def run_hybrid(user,cfg):
-    popcon_desktopapps = cfg.popcon_desktopapps
-    popcon_programs = cfg.popcon_programs
-    for strategy in hybrid:
-        cfg.strategy = strategy
-        for k in neighbors:
-            cfg.k_neighbors = k
-            #for size in popcon_size:
-            #    if size:
-            #        cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
-            #        cfg.popcon_programs = popcon_programs+"_"+size
-            for size in profile_size:
-                cfg.profile_size = size
-                run_strategy(cfg,user)
-
-if __name__ == '__main__':
-    #user = LocalSystem()
-    #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
-
-    cfg = Config()
-    #user = PopconSystem("/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7")
-    user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
-    #user =  PopconSystem("/root/.app-recommender/popcon-entries/4a/4a5834eb2aba6b6f17312239e0761c70")
-    user.filter_pkg_profile(cfg.pkgs_filter)
-    user.maximal_pkg_profile()
-
-    if "content" in sys.argv or len(sys.argv)<2:
-        run_content(user,cfg)
-    if "collaborative" in sys.argv or len(sys.argv)<2:
-        run_collaborative(user,cfg)
-    if "hybrid" in sys.argv or len(sys.argv)<2:
-        run_hybrid(user,cfg)
@@ -1,171 +0,0 @@
-#!/usr/bin/env python
-"""
-    recommender suite - recommender experiments suite 
-"""
-__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
-__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
-__license__ = """
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-"""
-
-import expsuite
-import sys
-sys.path.insert(0,'../')
-from config import Config
-from data import PopconXapianIndex, PopconSubmission
-from recommender import Recommender
-from user import LocalSystem, User
-from evaluation import *
-import logging
-import random
-import Gnuplot
-
-class ClusteringSuite(expsuite.PyExperimentSuite):
-    def reset(self, params, rep):
-        self.cfg = Config()
-        self.cfg.popcon_index = "../tests/test_data/.sample_pxi"
-        self.cfg.popcon_dir = "../tests/test_data/popcon_dir"
-        self.cfg.clusters_dir = "../tests/test_data/clusters_dir"
-
-        if params['name'] == "clustering":
-            logging.info("Starting 'clustering' experiments suite...")
-            self.cfg.index_mode = "recluster"
-
-    def iterate(self, params, rep, n):
-        if params['name'] == "clustering":
-            logging.info("Running iteration %d" % params['medoids'][n])
-            self.cfg.k_medoids = params['medoids'][n]
-            pxi = PopconXapianIndex(self.cfg)
-            result = {'k_medoids': params['medoids'][n],
-                   'dispersion': pxi.cluster_dispersion}
-        else:
-            result = {}
-        return result
-
-class ContentBasedSuite(expsuite.PyExperimentSuite):
-    def reset(self, params, rep):
-        if params['name'].startswith("content"):
-            cfg = Config()
-            #if the index was not built yet
-            #app_axi = AppAptXapianIndex(cfg.axi,"results/arnaldo/AppAxi")
-            cfg.axi = "data/AppAxi"
-            cfg.index_mode = "old"
-            cfg.weight = params['weight']
-            self.rec = Recommender(cfg)
-            self.rec.set_strategy(params['strategy'])
-            self.repo_size = self.rec.items_repository.get_doccount()
-            self.user = LocalSystem()
-            self.user.app_pkg_profile(self.rec.items_repository)
-            self.user.no_auto_pkg_profile()
-            self.sample_size = int(len(self.user.pkg_profile)*params['sample'])
-            # iteration should be set to 10 in config file
-            #self.profile_size = range(10,101,10)
-
-    def iterate(self, params, rep, n):
-        if params['name'].startswith("content"):
-            item_score = dict.fromkeys(self.user.pkg_profile,1)
-            # Prepare partition
-            sample = {}
-            for i in range(self.sample_size):
-                 key = random.choice(item_score.keys())
-                 sample[key] = item_score.pop(key)
-            # Get full recommendation
-            user = User(item_score)
-            recommendation = self.rec.get_recommendation(user,self.repo_size)
-            # Write recall log
-            recall_file = "results/content/recall/%s-%s-%.2f-%d" % \
-                          (params['strategy'],params['weight'],params['sample'],n)
-            output = open(recall_file,'w')
-            output.write("# weight=%s\n" % params['weight'])
-            output.write("# strategy=%s\n" % params['strategy'])
-            output.write("# sample=%f\n" % params['sample'])
-            output.write("\n%d %d %d\n" % \
-                         (self.repo_size,len(item_score),self.sample_size))
-            notfound = []
-            ranks = []
-            for pkg in sample.keys():
-                if pkg in recommendation.ranking:
-                    ranks.append(recommendation.ranking.index(pkg))
-                else:
-                    notfound.append(pkg)
-            for r in sorted(ranks):
-                output.write(str(r)+"\n")
-            if notfound:
-                output.write("Out of recommendation:\n")
-                for pkg in notfound:
-                    output.write(pkg+"\n")
-            output.close()
-            # Plot metrics summary
-            accuracy = []
-            precision = []
-            recall = []
-            f1 = []
-            g = Gnuplot.Gnuplot()
-            g('set style data lines')
-            g.xlabel('Recommendation size')
-            for size in range(1,len(recommendation.ranking)+1,100):
-                predicted = RecommendationResult(dict.fromkeys(recommendation.ranking[:size],1))
-                real = RecommendationResult(sample)
-                evaluation = Evaluation(predicted,real,self.repo_size)
-                accuracy.append([size,evaluation.run(Accuracy())])
-                precision.append([size,evaluation.run(Precision())])
-                recall.append([size,evaluation.run(Recall())])
-                f1.append([size,evaluation.run(F1())])
-            g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
-                   Gnuplot.Data(precision,title="Precision"),
-                   Gnuplot.Data(recall,title="Recall"),
-                   Gnuplot.Data(f1,title="F1"))
-            g.hardcopy(recall_file+"-plot.ps", enhanced=1, color=1)
-            # Iteration log
-            result = {'iteration': n,
-                      'weight': params['weight'],
-                      'strategy': params['strategy'],
-                      'accuracy': accuracy[20],
-                      'precision': precision[20],
-                      'recall:': recall[20],
-                      'f1': f1[20]}
-            return result
-
-#class CollaborativeSuite(expsuite.PyExperimentSuite):
-#    def reset(self, params, rep):
-#        if params['name'].startswith("collaborative"):
-#
-#    def iterate(self, params, rep, n):
-#        if params['name'].startswith("collaborative"):
-#            for root, dirs, files in os.walk(self.source_dir):
-#                for popcon_file in files:
-#                    submission = PopconSubmission(os.path.join(root,popcon_file))
-#                    user = User(submission.packages)
-#                    user.maximal_pkg_profile()
-#                    rec.get_recommendation(user)
-#                    precision = 0
-#                    result = {'weight': params['weight'],
-#                              'strategy': params['strategy'],
-#                              'profile_size': self.profile_size[n],
-#                              'accuracy': accuracy,
-#                              'precision': precision,
-#                              'recall:': recall,
-#                              'f1': }
-#        else:
-#            result = {}
-#        return result
-
-if __name__ == '__main__':
-
-    if "clustering" in sys.argv or len(sys.argv)<3:
-        ClusteringSuite().start()
-    if "content" in sys.argv or len(sys.argv)<3:
-        ContentBasedSuite().start()
-    #if "collaborative" in sys.argv or len(sys.argv)<3:
-    #CollaborativeSuite().start()
@@ -0,0 +1,44 @@
+#! /usr/bin/env python
+"""
+    sample-popcon-arch - extract a sample of a specific arch
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+import sys
+sys.path.insert(0,'../')
+import xapian
+import os
+import random
+import sys
+from user import RandomPopcon
+
+if __name__ == '__main__':
+    try:
+        size = int(sys.argv[1])
+        arch = sys.argv[2]
+        popcon_dir = sys.argv[3]
+        pkgs_filter = sys.argv[4]
+    except:
+        print "Usage: sample-popcon-arch size arch popcon_dir pkgs_filter"
+        exit(1)
+
+    sample_file = ("results/misc-popcon/sample-%s-%d" % (arch,size))
+    with open(sample_file,'w') as f:
+        for n in range(1,size+1):
+            user = RandomPopcon(popcon_dir,arch,pkgs_filter)
+            f.write(user.user_id+'\n')
+            print "sample",n
@@ -1,274 +0,0 @@
-#!/usr/bin/env python
-"""
-    recommender suite - recommender experiments suite 
-"""
-__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
-__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
-__license__ = """
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-"""
-
-import sys
-sys.path.insert(0,'../')
-from config import Config
-from data import PopconXapianIndex, PopconSubmission, AppAptXapianIndex
-from recommender import Recommender
-from user import LocalSystem, User
-from evaluation import *
-import logging
-import random
-import Gnuplot
-
-#iterations = 3
-#sample_proportions = [0.9]
-#weighting = [('bm25',1.2)]
-#collaborative = ['knn']
-#content_based = []
-#hybrid = ['knnco']
-#profile_size = [50,100]
-#popcon_size = ["1000"]
-#neighbors = [50]
-
-iterations = 10
-sample_proportions = [0.5, 0.6, 0.7, 0.8, 0.9]
-weighting = [('bm25',1.2), ('bm25',1.6), ('bm25',2.0), ('trad',0)]
-content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
-collaborative = ['knn_eset','knn','knn_plus']
-hybrid = ['knnco','knnco_eset']
-
-profile_size = range(20,100,20)
-#popcon_size = [1000,10000,50000,'full']
-neighbors = range(10,510,50)
-
-def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
-    # Write recall log
-    output = open(("%s-%d" % (log_file,n)),'w')
-    output.write("# %s-n\n" % label["description"])
-    output.write("# %s-%d\n" % (label["values"],n))
-    output.write("\n%d %d %d\n" % \
-                 (repo_size,profile_size,len(sample)))
-    if hasattr(recommendation,"ranking"):
-        notfound = []
-        ranks = []
-        for pkg in sample.keys():
-            if pkg in recommendation.ranking:
-                ranks.append(recommendation.ranking.index(pkg))
-            else:
-                notfound.append(pkg)
-        for r in sorted(ranks):
-            output.write(str(r)+"\n")
-        if notfound:
-            output.write("Out of recommendation:\n")
-            for pkg in notfound:
-                output.write(pkg+"\n")
-    output.close()
-
-def plot_summary(precision,recall,f1,f05,accuracy,log_file):
-    # Plot metrics summary
-    g = Gnuplot.Gnuplot()
-    g('set style data lines')
-    g.xlabel('Recommendation size')
-    g.title("Setup: %s" % log_file.split("/")[-1])
-    g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
-           Gnuplot.Data(precision,title="Precision"),
-           Gnuplot.Data(recall,title="Recall"),
-           Gnuplot.Data(f1,title="F_1"),
-           Gnuplot.Data(f05,title="F_0.5"))
-    g.hardcopy(log_file+".png",terminal="png")
-    g.hardcopy(log_file+".ps",terminal="postscript",enhanced=1,color=1)
-    g('set logscale x')
-    g('replot')
-    g.hardcopy(log_file+"-logscale.png",terminal="png")
-    g.hardcopy(log_file+"-logscale.ps",terminal="postscript",enhanced=1,color=1)
-
-def get_label(cfg,sample_proportion):
-    label = {}
-    if cfg.strategy in content_based:
-        label["description"] = "strategy-filter-profile-k1_bm25-sample"
-        label["values"] = ("%s-profile%d-%s-kbm%.1f-sample%.1f" %
-                           (cfg.strategy,cfg.profile_size,
-                            cfg.pkgs_filter.split("/")[-1],
-                            cfg.bm25_k1,sample_proportion))
-    elif cfg.strategy in collaborative:
-       label["description"] = "strategy-knn-filter-k1_bm25-sample"
-       label["values"] = ("%s-k%d-%s-kbm%.1f-sample%.1f" %
-                          (cfg.strategy,cfg.k_neighbors,
-                           cfg.pkgs_filter.split("/")[-1],
-                           cfg.bm25_k1,sample_proportion))
-    elif cfg.strategy in hybrid:
-       label["description"] = "strategy-knn-filter-profile-k1_bm25-sample"
-       label["values"] = ("%s-k%d-profile%d-%s-kbm%.1f-sample%.1f" %
-                          (cfg.strategy,cfg.k_neighbors,cfg.profile_size,
-                           cfg.pkgs_filter.split("/")[-1],
-                           cfg.bm25_k1,sample_proportion))
-    else:
-        print "Unknown strategy"
-    return label
-
-class ExperimentResults:
-    def __init__(self,repo_size):
-        self.repository_size = repo_size
-        self.accuracy = {}
-        self.precision = {}
-        self.recall = {}
-        self.f1 = {}
-        self.f05 = {}
-        points = [1]+range(10,200,10)+range(200,self.repository_size,100)
-        for size in points:
-            self.accuracy[size] = []
-            self.precision[size] = []
-            self.recall[size] = []
-            self.f1[size] = []
-            self.f05[size] = []
-
-    def add_result(self,ranking,sample):
-        for size in self.accuracy.keys():
-            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
-            real = RecommendationResult(sample)
-            evaluation = Evaluation(predicted,real,self.repository_size)
-            self.accuracy[size].append(evaluation.run(Accuracy()))
-            self.precision[size].append(evaluation.run(Precision()))
-            self.recall[size].append(evaluation.run(Recall()))
-            self.f1[size].append(evaluation.run(F_score(1)))
-            self.f05[size].append(evaluation.run(F_score(0.5)))
-
-    def get_precision_summary(self):
-        summary = [[size,sum(values)/len(values)] for size,values in self.precision.items()]
-        return sorted(summary)
-
-    def get_recall_summary(self):
-        summary = [[size,sum(values)/len(values)] for size,values in self.recall.items()]
-        return sorted(summary)
-
-    def get_f1_summary(self):
-        summary = [[size,sum(values)/len(values)] for size,values in self.f1.items()]
-        return sorted(summary)
-
-    def get_f05_summary(self):
-        summary = [[size,sum(values)/len(values)] for size,values in self.f05.items()]
-        return sorted(summary)
-
-    def get_accuracy_summary(self):
-        summary = [[size,sum(values)/len(values)] for size,values in self.accuracy.items()]
-        return sorted(summary)
-
-    def best_precision(self):
-        size = max(self.precision, key = lambda x: max(self.precision[x]))
-        return (size,max(self.precision[size]))
-
-    def best_f1(self):
-        size = max(self.f1, key = lambda x: max(self.f1[x]))
-        return (size,max(self.f1[size]))
-
-    def best_f05(self):
-        size = max(self.f05, key = lambda x: max(self.f05[x]))
-        return (size,max(self.f05[size]))
-
-def run_strategy(cfg,user):
-    for weight in weighting:
-        cfg.weight = weight[0]
-        cfg.bm25_k1 = weight[1]
-        rec = Recommender(cfg)
-        repo_size = rec.items_repository.get_doccount()
-        for proportion in sample_proportions:
-            results = ExperimentResults(repo_size)
-            label = get_label(cfg,proportion)
-            log_file = "results/strategies/"+label["values"]
-            for n in range(iterations):
-                # Fill sample profile
-                profile_size = len(user.pkg_profile)
-                item_score = {}
-                for pkg in user.pkg_profile:
-                    item_score[pkg] = user.item_score[pkg]
-                sample = {}
-                sample_size = int(profile_size*proportion)
-                for i in range(sample_size):
-                     key = random.choice(item_score.keys())
-                     sample[key] = item_score.pop(key)
-                iteration_user = User(item_score)
-                recommendation = rec.get_recommendation(iteration_user,repo_size)
-                write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file)
-                if hasattr(recommendation,"ranking"):
-                    results.add_result(recommendation.ranking,sample)
-            with open(log_file,'w') as f:
-                precision_10 = sum(results.precision[10])/len(results.precision[10])
-                f1_10 = sum(results.f1[10])/len(results.f1[10])
-                f05_10 = sum(results.f05[10])/len(results.f05[10])
-                f.write("# %s\n# %s\n\ncoverage %d\n\n" %
-                        (label["description"],label["values"],recommendation.size))
-                f.write("# best results (recommendation size; metric)\n")
-                f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\nf05 (%d; %.2f)\n\n" %
-                        (results.best_precision()[0],results.best_precision()[1],
-                         results.best_f1()[0],results.best_f1()[1],
-                         results.best_f05()[0],results.best_f05()[1]))
-                f.write("# recommendation size 10\nprecision (10; %.2f)\nf1 (10; %.2f)\nf05 (10; %.2f)" %
-                        (precision_10,f1_10,f05_10))
-            precision = results.get_precision_summary()
-            recall = results.get_recall_summary()
-            f1 = results.get_f1_summary()
-            f05 = results.get_f05_summary()
-            accuracy = results.get_accuracy_summary()
-            plot_summary(precision,recall,f1,f05,accuracy,log_file)
-
-def run_content(user,cfg):
-    for strategy in content_based:
-        cfg.strategy = strategy
-        for size in profile_size:
-            cfg.profile_size = size
-            run_strategy(cfg,user)
-
-def run_collaborative(user,cfg):
-    popcon_desktopapps = cfg.popcon_desktopapps
-    popcon_programs = cfg.popcon_programs
-    for strategy in collaborative:
-        cfg.strategy = strategy
-        for k in neighbors:
-            cfg.k_neighbors = k
-            #for size in popcon_size:
-            #    if size:
-            #        cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
-            #        cfg.popcon_programs = popcon_programs+"_"+size
-            run_strategy(cfg,user)
-
-def run_hybrid(user,cfg):
-    popcon_desktopapps = cfg.popcon_desktopapps
-    popcon_programs = cfg.popcon_programs
-    for strategy in hybrid:
-        cfg.strategy = strategy
-        for k in neighbors:
-            cfg.k_neighbors = k
-            #for size in popcon_size:
-            #    if size:
-            #        cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
-            #        cfg.popcon_programs = popcon_programs+"_"+size
-            for size in profile_size:
-                cfg.profile_size = size
-                run_strategy(cfg,user)
-
-if __name__ == '__main__':
-    #user = LocalSystem()
-    #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
-
-    cfg = Config()
-    user = PopconSystem("/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7")
-    #user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
-    user.filter_pkg_profile(cfg.pkgs_filter)
-    user.maximal_pkg_profile()
-
-    if "content" in sys.argv or len(sys.argv)<2:
-        run_content(user,cfg)
-    if "collaborative" in sys.argv or len(sys.argv)<2:
-        run_collaborative(user,cfg)
-    if "hybrid" in sys.argv or len(sys.argv)<2:
-        run_hybrid(user,cfg)
@@ -111,7 +111,7 @@ class User:
     """
     Define a user of a recommender.
     """
-    def __init__(self,item_score,user_id=0,demo_profiles_set=0):
+    def __init__(self,item_score,user_id=0,arch=0,demo_profiles_set=0):
         """
         Set initial user attributes. pkg_profile gets the whole set of items,
         a random user_id is set if none was provided and the demographic
@@ -119,6 +119,7 @@ class User:
         """
         self.item_score = item_score
         self.pkg_profile = self.items()
+        self.arch = arch
         if user_id:
             self.user_id = user_id
@@ -272,21 +273,28 @@ class User:
         return self.pkg_profile
 class RandomPopcon(User):
-    def __init__(self,submissions_dir,pkgs_filter=0):
+    def __init__(self,submissions_dir,arch=0,pkgs_filter=0):
         """
         Set initial parameters.
         """
         len_profile = 0
-        while len_profile < 100:
+        match_arch = False
+        while len_profile < 100 or not match_arch:
             path = random.choice([os.path.join(root, submission) for
                                   root, dirs, files in os.walk(submissions_dir)
                                   for submission in files])
             user = PopconSystem(path)
+            print arch
+            print user.arch
+            if arch and user.arch==arch:
+                match_arch = True
+                print "match"
             if pkgs_filter:
                 user.filter_pkg_profile(pkgs_filter)
             len_profile = len(user.pkg_profile)
+            print "p",len_profile
         submission = data.PopconSubmission(path)
-        User.__init__(self,submission.packages,submission.user_id)
+        User.__init__(self,submission.packages,submission.user_id,submission.arch)
 class PopconSystem(User):
     def __init__(self,path,user_id=0):
@@ -296,7 +304,7 @@ class PopconSystem(User):
         submission = data.PopconSubmission(path)
         if not user_id:
             user_id = submission.user_id
-        User.__init__(self,submission.packages,user_id)
+        User.__init__(self,submission.packages,user_id,submission.arch)
 class PkgsListSystem(User):
     def __init__(self,pkgs_list_or_file,user_id=0):
	@@ -37,7 +37,7 @@ if __name__ == '__main__':		@@ -37,7 +37,7 @@ if __name__ == '__main__':
37	#user = LocalSystem()	37	#user = LocalSystem()
38	#user = RandomPopcon(cfg.popcon_dir)	38	#user = RandomPopcon(cfg.popcon_dir)
39	#user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))	39	#user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
40	- user = PopconSystem("/home/tassia/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")	40	+ user = PopconSystem(os.path.expanduser("~/.app-recommender/popcon-entries/00/0001166d0737c6dffb083071e5ee69f5"))
41	user.filter_pkg_profile(os.path.join(cfg.filters_dir,"desktopapps"))	41	user.filter_pkg_profile(os.path.join(cfg.filters_dir,"desktopapps"))
42	user.maximal_pkg_profile()	42	user.maximal_pkg_profile()
43	begin_time = datetime.datetime.now()	43	begin_time = datetime.datetime.now()
	@@ -48,7 +48,7 @@ if __name__ == '__main__':		@@ -48,7 +48,7 @@ if __name__ == '__main__':
48	metrics.append(F_score(0.5))	48	metrics.append(F_score(0.5))
49	metrics.append(Accuracy())	49	metrics.append(Accuracy())
50	metrics.append(FPR())	50	metrics.append(FPR())
51	- validation = CrossValidation(0.9,10,rec,metrics,1)	51	+ validation = CrossValidation(0.9,20,rec,metrics,0.005)
52	validation.run(user)	52	validation.run(user)
53	print validation	53	print validation
54		54
@@ -0,0 +1,42 @@		@@ -0,0 +1,42 @@
	1	+#!/usr/bin/env python
	2	+"""
	3	+ AppRecommender - A GNU/Linux application recommender
	4	+"""
	5	+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
	6	+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
	7	+__license__ = """
	8	+ This program is free software: you can redistribute it and/or modify
	9	+ it under the terms of the GNU General Public License as published by
	10	+ the Free Software Foundation, either version 3 of the License, or
	11	+ (at your option) any later version.
	12	+
	13	+ This program is distributed in the hope that it will be useful,
	14	+ but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	16	+ GNU General Public License for more details.
	17	+
	18	+ You should have received a copy of the GNU General Public License
	19	+ along with this program. If not, see <http://www.gnu.org/licenses/>.
	20	+"""
	21	+
	22	+import os
	23	+import sys
	24	+sys.path.insert(0,'../')
	25	+import xapian
	26	+
	27	+if __name__ == '__main__':
	28	+ if len(sys.argv)<2:
	29	+ print "Usage: get_axipkgs index_path"
	30	+ exit(1)
	31	+
	32	+ axi_path = sys.argv[1]
	33	+ axi = xapian.Database(axi_path)
	34	+ for n in range(1,axi.get_lastdocid()):
	35	+ doc = 0
	36	+ try:
	37	+ doc = axi.get_document(n)
	38	+ except:
	39	+ pass
	40	+ if doc:
	41	+ xp_terms = [t.term for t in doc.termlist() if t.term.startswith("XP")]
	42	+ print xp_terms[0].lstrip('XP')
1	#!/usr/bin/env bash	1	#!/usr/bin/env bash
2	#	2	#
3	-# get_desktop.sh - get packages which have desktop files	3	+# get_desktop.sh - get packages which have desktop files
		4	+#
		5	+# DEPRECATED: use get_axipkgs.py to get this info from axi
4		6
5	cd /usr/share/app-install/desktop	7	cd /usr/share/app-install/desktop
6	sed -ne 's/X-AppInstall-Package=//p' * | sort -u | grep -v kdelibs | grep -v libfm-gtk0	8	sed -ne 's/X-AppInstall-Package=//p' * | sort -u | grep -v kdelibs | grep -v libfm-gtk0
	@@ -140,6 +140,29 @@ class FPR(Metric):		@@ -140,6 +140,29 @@ class FPR(Metric):
140	return (float(len(evaluation.false_positive))/	140	return (float(len(evaluation.false_positive))/
141	evaluation.real_negative_len)	141	evaluation.real_negative_len)
142		142
		143	+class MCC(Metric):
		144	+ """
		145	+ Matthews correlation coefficient.
		146	+ """
		147	+ def __init__(self):
		148	+ """
		149	+ Set metric description.
		150	+ """
		151	+ self.desc = " MCC "
		152	+
		153	+ def run(self,evaluation):
		154	+ """
		155	+ Compute metric.
		156	+ """
		157	+ VP = len(evaluation.true_positive)
		158	+ FP = len(evaluation.false_positive)
		159	+ FN = len(evaluation.false_negative)
		160	+ VN = evaluation.true_negative_len
		161	+ if (VP+FP)==0 or (VP+FN)==0 or (VN+FP)==0 or (VN+FN)==0:
		162	+ return 0
		163	+ MCC = (((VP*VN)-(FP*FN))/math.sqrt((VP+FP)*(VP+FN)*(VN+FP)*(VN+FN)))
		164	+ return MCC
		165	+
143	class F_score(Metric):	166	class F_score(Metric):
144	"""	167	"""
145	Classification accuracy metric which correlates precision and recall into an	168	Classification accuracy metric which correlates precision and recall into an
1	-Experiments handled by expsuite:
2	-https://github.com/rueckstiess/expsuite	1	+AppRecommender experiments and tests
		2	+---------------------------------------
		3	+
		4	+Install dependencies:
		5	+
		6	+# apt-get install \
		7	+python-unittest2 python-gnuplot gnuplot
		8	+
		9	+# cd ./src; git clone https://github.com/rueckstiess/expsuite (deprecated tests)
@@ -0,0 +1,186 @@		@@ -0,0 +1,186 @@
	1	+#!/usr/bin/env python
	2	+"""
	3	+ k-suite - experiment different neighborhood sizes
	4	+"""
	5	+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
	6	+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
	7	+__license__ = """
	8	+ This program is free software: you can redistribute it and/or modify
	9	+ it under the terms of the GNU General Public License as published by
	10	+ the Free Software Foundation, either version 3 of the License, or
	11	+ (at your option) any later version.
	12	+
	13	+ This program is distributed in the hope that it will be useful,
	14	+ but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	16	+ GNU General Public License for more details.
	17	+
	18	+ You should have received a copy of the GNU General Public License
	19	+ along with this program. If not, see <http://www.gnu.org/licenses/>.
	20	+"""
	21	+
	22	+import sys
	23	+sys.path.insert(0,'../')
	24	+from config import Config
	25	+from data import PopconXapianIndex, PopconSubmission
	26	+from recommender import Recommender
	27	+from user import LocalSystem, User
	28	+from evaluation import *
	29	+import logging
	30	+import random
	31	+import Gnuplot
	32	+import numpy
	33	+
	34	+def plot_roc(k,roc_points,log_file):
	35	+ g = Gnuplot.Gnuplot()
	36	+ g('set style data points')
	37	+ g.xlabel('False Positive Rate')
	38	+ g.ylabel('True Positive Rate')
	39	+ g('set xrange [0:1.0]')
	40	+ g('set yrange [0:1.0]')
	41	+ g.title("Setup: %s-k%d" % (log_file.split("/")[-1],k))
	42	+ g.plot(Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"),
	43	+ Gnuplot.Data(roc_points))
	44	+ g.hardcopy(log_file+("-k%.3d.png"%k),terminal="png")
	45	+ g.hardcopy(log_file+("-k%.3d.ps"%k),terminal="postscript",enhanced=1,color=1)
	46	+
	47	+def plot_summary(precision,f05,mcc,log_file):
	48	+ g = Gnuplot.Gnuplot()
	49	+ g('set style data lines')
	50	+ g.xlabel('Neighborhood (k)')
	51	+ g.title("Setup: %s-size20" % (log_file.split("/")[-1]))
	52	+ g.plot(Gnuplot.Data([[k,sum(precision[k])/len(precision[k])] for k in precision.keys()],title="P"),
	53	+ Gnuplot.Data([[k,sum(f05[k])/len(f05[k])] for k in f05.keys()],title="F05"),
	54	+ Gnuplot.Data([[k,sum(mcc[k])/len(mcc[k])] for k in mcc.keys()],title="MCC"))
	55	+ g.hardcopy(log_file+(".png"),terminal="png")
	56	+ g.hardcopy(log_file+(".ps"),terminal="postscript",enhanced=1,color=1)
	57	+
	58	+class ExperimentResults:
	59	+ def __init__(self,repo_size):
	60	+ self.repository_size = repo_size
	61	+ self.precision = []
	62	+ self.recall = []
	63	+ self.fpr = []
	64	+ self.f05 = []
	65	+ self.mcc = []
	66	+
	67	+ def add_result(self,ranking,sample):
	68	+ predicted = RecommendationResult(dict.fromkeys(ranking,1))
	69	+ real = RecommendationResult(sample)
	70	+ evaluation = Evaluation(predicted,real,self.repository_size)
	71	+ self.precision.append(evaluation.run(Precision()))
	72	+ self.recall.append(evaluation.run(Recall()))
	73	+ self.fpr.append(evaluation.run(FPR()))
	74	+ self.f05.append(evaluation.run(F_score(0.5)))
	75	+ self.mcc.append(evaluation.run(MCC()))
	76	+
	77	+ def get_roc_point(self):
	78	+ tpr = self.recall
	79	+ fpr = self.fpr
	80	+ if not tpr or not fpr:
	81	+ return [0,0]
	82	+ return [sum(fpr)/len(fpr),sum(tpr)/len(tpr)]
	83	+
	84	+ def get_precision_summary(self):
	85	+ if not self.precision: return 0
	86	+ return sum(self.precision)/len(self.precision)
	87	+
	88	+ def get_f05_summary(self):
	89	+ if not self.f05: return 0
	90	+ return sum(self.f05)/len(self.f05)
	91	+
	92	+ def get_mcc_summary(self):
	93	+ if not self.mcc: return 0
	94	+ return sum(self.mcc)/len(self.mcc)
	95	+
	96	+if __name__ == '__main__':
	97	+ if len(sys.argv)<3:
	98	+ print "Usage: k-suite strategy_str sample_file"
	99	+ exit(1)
	100	+ threshold = 20
	101	+ iterations = 30
	102	+ neighbors = [3,5,10,50,100,150,200,300,400,500]
	103	+ cfg = Config()
	104	+ cfg.strategy = sys.argv[1]
	105	+ sample_file = sys.argv[2]
	106	+ population_sample = []
	107	+ with open(sample_file,'r') as f:
	108	+ for line in f.readlines():
	109	+ user_id = line.strip('\n')
	110	+ population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
	111	+ # setup dictionaries and files
	112	+ roc_summary = {}
	113	+ recommended = {}
	114	+ precision_summary = {}
	115	+ f05_summary = {}
	116	+ mcc_summary = {}
	117	+ sample_dir = ("results/k-suite/%s" % sample_file.split('/')[-1])
	118	+ if not os.path.exists(sample_dir):
	119	+ os.makedirs(sample_dir)
	120	+ log_file = os.path.join(sample_dir,cfg.strategy)
	121	+ with open(log_file,'w') as f:
	122	+ f.write("# %s\n\n" % sample_file.split('/')[-1])
	123	+ f.write("# strategy %s recommendation_size %d iterations %d\n\n" %
	124	+ (cfg.strategy,threshold,iterations))
	125	+ f.write("# k coverage \tprecision \tf05 \tmcc\n\n")
	126	+
	127	+ for k in neighbors:
	128	+ roc_summary[k] = []
	129	+ recommended[k] = set()
	130	+ precision_summary[k] = []
	131	+ f05_summary[k] = []
	132	+ mcc_summary[k] = []
	133	+ with open(log_file+"-k%.3d"%k,'w') as f:
	134	+ f.write("# %s\n\n" % sample_file.split('/')[-1])
	135	+ f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))
	136	+ f.write("# roc_point \tprecision \tf05 \tmcc\n\n")
	137	+
	138	+ # main loop per user
	139	+ for submission_file in population_sample:
	140	+ user = PopconSystem(submission_file)
	141	+ user.filter_pkg_profile(cfg.pkgs_filter)
	142	+ user.maximal_pkg_profile()
	143	+ for k in neighbors:
	144	+ cfg.k_neighbors = k
	145	+ rec = Recommender(cfg)
	146	+ repo_size = rec.items_repository.get_doccount()
	147	+ results = ExperimentResults(repo_size)
	148	+ # n iterations for same recommender and user
	149	+ for n in range(iterations):
	150	+ # Fill sample profile
	151	+ profile_len = len(user.pkg_profile)
	152	+ item_score = {}
	153	+ for pkg in user.pkg_profile:
	154	+ item_score[pkg] = user.item_score[pkg]
	155	+ sample = {}
	156	+ sample_size = int(profile_len*0.9)
	157	+ for i in range(sample_size):
	158	+ key = random.choice(item_score.keys())
	159	+ sample[key] = item_score.pop(key)
	160	+ iteration_user = User(item_score)
	161	+ recommendation = rec.get_recommendation(iteration_user,threshold)
	162	+ if hasattr(recommendation,"ranking"):
	163	+ results.add_result(recommendation.ranking,sample)
	164	+ recommended[k] = recommended[k].union(recommendation.ranking)
	165	+ # save summary
	166	+ roc_point = results.get_roc_point()
	167	+ roc_summary[k].append(roc_point)
	168	+ precision = results.get_precision_summary()
	169	+ precision_summary[k].append(precision)
	170	+ f05 = results.get_f05_summary()
	171	+ f05_summary[k].append(f05)
	172	+ mcc = results.get_mcc_summary()
	173	+ mcc_summary[k].append(mcc)
	174	+ with open(log_file+"-k%.3d"%k,'a') as f:
	175	+ f.write("[%.2f,%.2f] \t%.4f \t%.4f \t%.4f\n" %
	176	+ (roc_point[0],roc_point[1],precision,f05,mcc))
	177	+ # back to main flow
	178	+ with open(log_file,'a') as f:
	179	+ plot_summary(precision_summary,f05_summary,mcc_summary,log_file)
	180	+ for k in neighbors:
	181	+ coverage = len(recommended[size])/float(repo_size)
	182	+ f.write("%3d \t%.2f \t%.4f \t%.4f \t%.4f\n" %
	183	+ (k,coverage,float(sum(precision_summary[k]))/len(precision_summary[k]),
	184	+ float(sum(f05_summary[k]))/len(f05_summary[k]),
	185	+ float(sum(mcc_summary[k]))/len(mcc_summary[k])))
	186	+ plot_roc(k,roc_summary[k],log_file)
@@ -0,0 +1,274 @@		@@ -0,0 +1,274 @@
	1	+#!/usr/bin/env python
	2	+"""
	3	+ recommender suite - recommender experiments suite
	4	+"""
	5	+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
	6	+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
	7	+__license__ = """
	8	+ This program is free software: you can redistribute it and/or modify
	9	+ it under the terms of the GNU General Public License as published by
	10	+ the Free Software Foundation, either version 3 of the License, or
	11	+ (at your option) any later version.
	12	+
	13	+ This program is distributed in the hope that it will be useful,
	14	+ but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	16	+ GNU General Public License for more details.
	17	+
	18	+ You should have received a copy of the GNU General Public License
	19	+ along with this program. If not, see <http://www.gnu.org/licenses/>.
	20	+"""
	21	+
	22	+import sys
	23	+sys.path.insert(0,'../')
	24	+from config import Config
	25	+from data import PopconXapianIndex, PopconSubmission, AppAptXapianIndex
	26	+from recommender import Recommender
	27	+from user import LocalSystem, User
	28	+from evaluation import *
	29	+import logging
	30	+import random
	31	+import Gnuplot
	32	+
	33	+#iterations = 3
	34	+#sample_proportions = [0.9]
	35	+#weighting = [('bm25',1.2)]
	36	+#collaborative = ['knn']
	37	+#content_based = []
	38	+#hybrid = ['knnco']
	39	+#profile_size = [50,100]
	40	+#popcon_size = ["1000"]
	41	+#neighbors = [50]
	42	+
	43	+iterations = 10
	44	+sample_proportions = [0.5, 0.6, 0.7, 0.8, 0.9]
	45	+weighting = [('bm25',1.2), ('bm25',1.6), ('bm25',2.0), ('trad',0)]
	46	+content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
	47	+collaborative = ['knn_eset','knn','knn_plus']
	48	+hybrid = ['knnco','knnco_eset']
	49	+
	50	+profile_size = range(20,100,20)
	51	+#popcon_size = [1000,10000,50000,'full']
	52	+neighbors = range(10,510,50)
	53	+
	54	+def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
	55	+ # Write recall log
	56	+ output = open(("%s-%d" % (log_file,n)),'w')
	57	+ output.write("# %s-n\n" % label["description"])
	58	+ output.write("# %s-%d\n" % (label["values"],n))
	59	+ output.write("\n%d %d %d\n" % \
	60	+ (repo_size,profile_size,len(sample)))
	61	+ if hasattr(recommendation,"ranking"):
	62	+ notfound = []
	63	+ ranks = []
	64	+ for pkg in sample.keys():
	65	+ if pkg in recommendation.ranking:
	66	+ ranks.append(recommendation.ranking.index(pkg))
	67	+ else:
	68	+ notfound.append(pkg)
	69	+ for r in sorted(ranks):
	70	+ output.write(str(r)+"\n")
	71	+ if notfound:
	72	+ output.write("Out of recommendation:\n")
	73	+ for pkg in notfound:
	74	+ output.write(pkg+"\n")
	75	+ output.close()
	76	+
	77	+def plot_summary(precision,recall,f1,f05,accuracy,log_file):
	78	+ # Plot metrics summary
	79	+ g = Gnuplot.Gnuplot()
	80	+ g('set style data lines')
	81	+ g.xlabel('Recommendation size')
	82	+ g.title("Setup: %s" % log_file.split("/")[-1])
	83	+ g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
	84	+ Gnuplot.Data(precision,title="Precision"),
	85	+ Gnuplot.Data(recall,title="Recall"),
	86	+ Gnuplot.Data(f1,title="F_1"),
	87	+ Gnuplot.Data(f05,title="F_0.5"))
	88	+ g.hardcopy(log_file+".png",terminal="png")
	89	+ g.hardcopy(log_file+".ps",terminal="postscript",enhanced=1,color=1)
	90	+ g('set logscale x')
	91	+ g('replot')
	92	+ g.hardcopy(log_file+"-logscale.png",terminal="png")
	93	+ g.hardcopy(log_file+"-logscale.ps",terminal="postscript",enhanced=1,color=1)
	94	+
	95	+def get_label(cfg,sample_proportion):
	96	+ label = {}
	97	+ if cfg.strategy in content_based:
	98	+ label["description"] = "strategy-filter-profile-k1_bm25-sample"
	99	+ label["values"] = ("%s-profile%d-%s-kbm%.1f-sample%.1f" %
	100	+ (cfg.strategy,cfg.profile_size,
	101	+ cfg.pkgs_filter.split("/")[-1],
	102	+ cfg.bm25_k1,sample_proportion))
	103	+ elif cfg.strategy in collaborative:
	104	+ label["description"] = "strategy-knn-filter-k1_bm25-sample"
	105	+ label["values"] = ("%s-k%d-%s-kbm%.1f-sample%.1f" %
	106	+ (cfg.strategy,cfg.k_neighbors,
	107	+ cfg.pkgs_filter.split("/")[-1],
	108	+ cfg.bm25_k1,sample_proportion))
	109	+ elif cfg.strategy in hybrid:
	110	+ label["description"] = "strategy-knn-filter-profile-k1_bm25-sample"
	111	+ label["values"] = ("%s-k%d-profile%d-%s-kbm%.1f-sample%.1f" %
	112	+ (cfg.strategy,cfg.k_neighbors,cfg.profile_size,
	113	+ cfg.pkgs_filter.split("/")[-1],
	114	+ cfg.bm25_k1,sample_proportion))
	115	+ else:
	116	+ print "Unknown strategy"
	117	+ return label
	118	+
	119	+class ExperimentResults:
	120	+ def __init__(self,repo_size):
	121	+ self.repository_size = repo_size
	122	+ self.accuracy = {}
	123	+ self.precision = {}
	124	+ self.recall = {}
	125	+ self.f1 = {}
	126	+ self.f05 = {}
	127	+ points = [1]+range(10,200,10)+range(200,self.repository_size,100)
	128	+ for size in points:
	129	+ self.accuracy[size] = []
	130	+ self.precision[size] = []
	131	+ self.recall[size] = []
	132	+ self.f1[size] = []
	133	+ self.f05[size] = []
	134	+
	135	+ def add_result(self,ranking,sample):
	136	+ for size in self.accuracy.keys():
	137	+ predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
	138	+ real = RecommendationResult(sample)
	139	+ evaluation = Evaluation(predicted,real,self.repository_size)
	140	+ self.accuracy[size].append(evaluation.run(Accuracy()))
	141	+ self.precision[size].append(evaluation.run(Precision()))
	142	+ self.recall[size].append(evaluation.run(Recall()))
	143	+ self.f1[size].append(evaluation.run(F_score(1)))
	144	+ self.f05[size].append(evaluation.run(F_score(0.5)))
	145	+
	146	+ def get_precision_summary(self):
	147	+ summary = [[size,sum(values)/len(values)] for size,values in self.precision.items()]
	148	+ return sorted(summary)
	149	+
	150	+ def get_recall_summary(self):
	151	+ summary = [[size,sum(values)/len(values)] for size,values in self.recall.items()]
	152	+ return sorted(summary)
	153	+
	154	+ def get_f1_summary(self):
	155	+ summary = [[size,sum(values)/len(values)] for size,values in self.f1.items()]
	156	+ return sorted(summary)
	157	+
	158	+ def get_f05_summary(self):
	159	+ summary = [[size,sum(values)/len(values)] for size,values in self.f05.items()]
	160	+ return sorted(summary)
	161	+
	162	+ def get_accuracy_summary(self):
	163	+ summary = [[size,sum(values)/len(values)] for size,values in self.accuracy.items()]
	164	+ return sorted(summary)
	165	+
	166	+ def best_precision(self):
	167	+ size = max(self.precision, key = lambda x: max(self.precision[x]))
	168	+ return (size,max(self.precision[size]))
	169	+
	170	+ def best_f1(self):
	171	+ size = max(self.f1, key = lambda x: max(self.f1[x]))
	172	+ return (size,max(self.f1[size]))
	173	+
	174	+ def best_f05(self):
	175	+ size = max(self.f05, key = lambda x: max(self.f05[x]))
	176	+ return (size,max(self.f05[size]))
	177	+
	178	+def run_strategy(cfg,user):
	179	+ for weight in weighting:
	180	+ cfg.weight = weight[0]
	181	+ cfg.bm25_k1 = weight[1]
	182	+ rec = Recommender(cfg)
	183	+ repo_size = rec.items_repository.get_doccount()
	184	+ for proportion in sample_proportions:
	185	+ results = ExperimentResults(repo_size)
	186	+ label = get_label(cfg,proportion)
	187	+ log_file = "results/strategies/"+label["values"]
	188	+ for n in range(iterations):
	189	+ # Fill sample profile
	190	+ profile_size = len(user.pkg_profile)
	191	+ item_score = {}
	192	+ for pkg in user.pkg_profile:
	193	+ item_score[pkg] = user.item_score[pkg]
	194	+ sample = {}
	195	+ sample_size = int(profile_size*proportion)
	196	+ for i in range(sample_size):
	197	+ key = random.choice(item_score.keys())
	198	+ sample[key] = item_score.pop(key)
	199	+ iteration_user = User(item_score)
	200	+ recommendation = rec.get_recommendation(iteration_user,repo_size)
	201	+ write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file)
	202	+ if hasattr(recommendation,"ranking"):
	203	+ results.add_result(recommendation.ranking,sample)
	204	+ with open(log_file,'w') as f:
	205	+ precision_10 = sum(results.precision[10])/len(results.precision[10])
	206	+ f1_10 = sum(results.f1[10])/len(results.f1[10])
	207	+ f05_10 = sum(results.f05[10])/len(results.f05[10])
	208	+ f.write("# %s\n# %s\n\ncoverage %d\n\n" %
	209	+ (label["description"],label["values"],recommendation.size))
	210	+ f.write("# best results (recommendation size; metric)\n")
	211	+ f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\nf05 (%d; %.2f)\n\n" %
	212	+ (results.best_precision()[0],results.best_precision()[1],
	213	+ results.best_f1()[0],results.best_f1()[1],
	214	+ results.best_f05()[0],results.best_f05()[1]))
	215	+ f.write("# recommendation size 10\nprecision (10; %.2f)\nf1 (10; %.2f)\nf05 (10; %.2f)" %
	216	+ (precision_10,f1_10,f05_10))
	217	+ precision = results.get_precision_summary()
	218	+ recall = results.get_recall_summary()
	219	+ f1 = results.get_f1_summary()
	220	+ f05 = results.get_f05_summary()
	221	+ accuracy = results.get_accuracy_summary()
	222	+ plot_summary(precision,recall,f1,f05,accuracy,log_file)
	223	+
	224	+def run_content(user,cfg):
	225	+ for strategy in content_based:
	226	+ cfg.strategy = strategy
	227	+ for size in profile_size:
	228	+ cfg.profile_size = size
	229	+ run_strategy(cfg,user)
	230	+
	231	+def run_collaborative(user,cfg):
	232	+ popcon_desktopapps = cfg.popcon_desktopapps
	233	+ popcon_programs = cfg.popcon_programs
	234	+ for strategy in collaborative:
	235	+ cfg.strategy = strategy
	236	+ for k in neighbors:
	237	+ cfg.k_neighbors = k
	238	+ #for size in popcon_size:
	239	+ # if size:
	240	+ # cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
	241	+ # cfg.popcon_programs = popcon_programs+"_"+size
	242	+ run_strategy(cfg,user)
	243	+
	244	+def run_hybrid(user,cfg):
	245	+ popcon_desktopapps = cfg.popcon_desktopapps
	246	+ popcon_programs = cfg.popcon_programs
	247	+ for strategy in hybrid:
	248	+ cfg.strategy = strategy
	249	+ for k in neighbors:
	250	+ cfg.k_neighbors = k
	251	+ #for size in popcon_size:
	252	+ # if size:
	253	+ # cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
	254	+ # cfg.popcon_programs = popcon_programs+"_"+size
	255	+ for size in profile_size:
	256	+ cfg.profile_size = size
	257	+ run_strategy(cfg,user)
	258	+
	259	+if __name__ == '__main__':
	260	+ #user = LocalSystem()
	261	+ #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
	262	+
	263	+ cfg = Config()
	264	+ user = PopconSystem("/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7")
	265	+ #user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
	266	+ user.filter_pkg_profile(cfg.pkgs_filter)
	267	+ user.maximal_pkg_profile()
	268	+
	269	+ if "content" in sys.argv or len(sys.argv)<2:
	270	+ run_content(user,cfg)
	271	+ if "collaborative" in sys.argv or len(sys.argv)<2:
	272	+ run_collaborative(user,cfg)
	273	+ if "hybrid" in sys.argv or len(sys.argv)<2:
	274	+ run_hybrid(user,cfg)
	@@ -1,27 +0,0 @@	@@ -1,27 +0,0 @@
1	-[DEFAULT]
2	-repetitions = 1
3	-iterations = 10
4	-path = 'results'
5	-experiment = 'grid'
6	-weight = ['bm25', 'trad']
7	-;profile_size = range(10,100,10)
8	-;sample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
9	-sample = [0.6, 0.7, 0.8, 0.9]
10	-
11	-[content]
12	-strategy = ['cb','cbt','cbd']
13	-
14	-[clustering]
15	-experiment = 'single'
16	-;iterations = 4
17	-;medoids = range(2,6)
18	-iterations = 6
19	-medoids = [100,500,1000,5000,10000,50000]
20	-;disabled for this experiment
21	-weight = 0
22	-profile_size = 0
23	-sample = 0
24	-
25	-[colaborative]
26	-users_repository=["data/popcon","data/popcon-100","data/popcon-500","data/popcon-1000","data/popcon-5000","data/popcon-10000","data/popcon-50000"]
27	-neighbors = range(10,1010,50)
@@ -0,0 +1,49 @@		@@ -0,0 +1,49 @@
	1	+#! /usr/bin/env python
	2	+"""
	3	+ sample-popcon - extract a sample from popcon population
	4	+"""
	5	+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
	6	+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
	7	+__license__ = """
	8	+ This program is free software: you can redistribute it and/or modify
	9	+ it under the terms of the GNU General Public License as published by
	10	+ the Free Software Foundation, either version 3 of the License, or
	11	+ (at your option) any later version.
	12	+
	13	+ This program is distributed in the hope that it will be useful,
	14	+ but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	16	+ GNU General Public License for more details.
	17	+
	18	+ You should have received a copy of the GNU General Public License
	19	+ along with this program. If not, see <http://www.gnu.org/licenses/>.
	20	+"""
	21	+
	22	+import xapian
	23	+import os
	24	+import random
	25	+import sys
	26	+
	27	+if __name__ == '__main__':
	28	+ try:
	29	+ sample_file = sys.argv[1]
	30	+ popcon = xapian.WritableDatabase(sys.argv[2],xapian.DB_OPEN)
	31	+ except:
	32	+ print "Usage: extract-sample-db sample_file popcon_index"
	33	+ exit(1)
	34	+ enquire = xapian.Enquire(popcon)
	35	+ print sample_file.split("/")
	36	+ new_popcon = xapian.WritableDatabase(sys.argv[2]+"-"+sample_file.split("/")[-1],xapian.DB_CREATE_OR_OVERWRITE)
	37	+ print ("Popcon repository size: %d" % popcon.get_doccount())
	38	+ for submission in open(sample_file):
	39	+ print "ID"+submission.strip()
	40	+ query = xapian.Query("ID"+submission.strip())
	41	+ enquire.set_query(query)
	42	+ mset = enquire.get_mset(0,20)
	43	+ for m in mset:
	44	+ print "Adding doc %s"%m.docid
	45	+ new_popcon.add_document(popcon.get_document(m.docid))
	46	+ print "Removing doc %s"%m.docid
	47	+ popcon.delete_document(m.docid)
	48	+ print ("Popcon repository size: %d" % popcon.get_doccount())
	49	+ print ("Popcon repository size: %d" % new_popcon.get_doccount())
@@ -0,0 +1,202 @@		@@ -0,0 +1,202 @@
	1	+#!/usr/bin/env python
	2	+"""
	3	+ hybrid-suite
	4	+"""
	5	+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
	6	+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
	7	+__license__ = """
	8	+ This program is free software: you can redistribute it and/or modify
	9	+ it under the terms of the GNU General Public License as published by
	10	+ the Free Software Foundation, either version 3 of the License, or
	11	+ (at your option) any later version.
	12	+
	13	+ This program is distributed in the hope that it will be useful,
	14	+ but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	16	+ GNU General Public License for more details.
	17	+
	18	+ You should have received a copy of the GNU General Public License
	19	+ along with this program. If not, see <http://www.gnu.org/licenses/>.
	20	+"""
	21	+
	22	+import sys
	23	+sys.path.insert(0,'../')
	24	+from config import Config
	25	+from data import PopconXapianIndex, PopconSubmission
	26	+from recommender import Recommender
	27	+from user import LocalSystem, User
	28	+from evaluation import *
	29	+import logging
	30	+import random
	31	+import Gnuplot
	32	+import numpy
	33	+
	34	+#hybrid_strategies = ['knnco','knnco_eset']
	35	+
	36	+if __name__ == '__main__':
	37	+ if len(sys.argv)<2:
	38	+ print "Usage: hybrid strategy sample_file"
	39	+ exit(1)
	40	+
	41	+ iterations = 20
	42	+ profile_size = [10,40,70,100,170,240]
	43	+ neighbor_size = [3,10,50,70,100,150,200]
	44	+
	45	+ #iterations = 1
	46	+ #profile_size = [10,20,30]
	47	+ #neighbor_size = [10,20,30]
	48	+
	49	+ cfg = Config()
	50	+ population_sample = []
	51	+ strategy = sys.argv[1]
	52	+ sample_file = sys.argv[2]
	53	+ sample_str = sample_file.split('/')[-1]
	54	+ with open(sample_file,'r') as f:
	55	+ for line in f.readlines():
	56	+ user_id = line.strip('\n')
	57	+ population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
	58	+ sample_dir = ("results/hybrid/%s/%s" % (sample_str,strategy))
	59	+ if not os.path.exists(sample_dir):
	60	+ os.makedirs(sample_dir)
	61	+
	62	+ cfg.strategy = strategy
	63	+ p_10_summary = {}
	64	+ f05_100_summary = {}
	65	+ c_10 = {}
	66	+ c_100 = {}
	67	+
	68	+ log_file = os.path.join(sample_dir,sample_str+"-"+cfg.strategy)
	69	+ graph_10 = {}
	70	+ graph_100 = {}
	71	+ graph_10_jpg = {}
	72	+ graph_100_jpg = {}
	73	+ comment_10 = {}
	74	+ comment_100 = {}
	75	+ for k in neighbor_size:
	76	+ graph_10[k] = log_file+("-neighborhood%.3d-010.png"%k)
	77	+ graph_100[k] = log_file+("-neighborhood%.3d-100.png"%k)
	78	+ graph_10_jpg[k] = graph_10[k].strip(".png")+".jpg"
	79	+ graph_100_jpg[k] = graph_100[k].strip(".png")+".jpg"
	80	+ comment_10[k] = graph_10_jpg[k]+".comment"
	81	+ comment_100[k] = graph_100_jpg[k]+".comment"
	82	+
	83	+ with open(comment_10[k],'w') as f:
	84	+ f.write("# %s\n" % sample_str)
	85	+ f.write("# strategy %s\n# threshold 10\n# iterations %d\n\n" %
	86	+ (cfg.strategy,iterations))
	87	+ f.write("# neighborhood\tprofile\tmean_p_10\tdev_p_10\tc_10\n\n")
	88	+ with open(comment_100[k],'w') as f:
	89	+ f.write("# %s\n" % sample_str)
	90	+ f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
	91	+ (cfg.strategy,iterations))
	92	+ f.write("# neighborhood\tprofile\tmean_f05_100\tdev_f05_100\tc_100\n\n")
	93	+
	94	+ c_10[k] = {}
	95	+ c_100[k] = {}
	96	+ p_10_summary[k] = {}
	97	+ f05_100_summary[k] = {}
	98	+ for size in profile_size:
	99	+ c_10[k][size] = set()
	100	+ c_100[k][size] = set()
	101	+ p_10_summary[k][size] = []
	102	+ f05_100_summary[k][size] = []
	103	+ with open(log_file+"-neighborhood%.3d-profile%.3d"%(k,size),'w') as f:
	104	+ f.write("# %s\n" % sample_str)
	105	+ f.write("# strategy %s-neighborhood%.3d-profile%.3d\n\n" % (cfg.strategy,k,size))
	106	+ f.write("# p_10\t\tf05_100\n\n")
	107	+
	108	+ # main loop per user
	109	+ for submission_file in population_sample:
	110	+ user = PopconSystem(submission_file)
	111	+ user.filter_pkg_profile(cfg.pkgs_filter)
	112	+ user.maximal_pkg_profile()
	113	+ for k in neighbor_size:
	114	+ cfg.k_neighbors = k
	115	+ for size in profile_size:
	116	+ cfg.profile_size = size
	117	+ rec = Recommender(cfg)
	118	+ repo_size = rec.items_repository.get_doccount()
	119	+ p_10 = []
	120	+ f05_100 = []
	121	+ for n in range(iterations):
	122	+ # Fill sample profile
	123	+ profile_len = len(user.pkg_profile)
	124	+ item_score = {}
	125	+ for pkg in user.pkg_profile:
	126	+ item_score[pkg] = user.item_score[pkg]
	127	+ sample = {}
	128	+ sample_size = int(profile_len*0.9)
	129	+ for i in range(sample_size):
	130	+ key = random.choice(item_score.keys())
	131	+ sample[key] = item_score.pop(key)
	132	+ iteration_user = User(item_score)
	133	+ recommendation = rec.get_recommendation(iteration_user,repo_size)
	134	+ if hasattr(recommendation,"ranking"):
	135	+ ranking = recommendation.ranking
	136	+ real = RecommendationResult(sample)
	137	+ predicted_10 = RecommendationResult(dict.fromkeys(ranking[:10],1))
	138	+ evaluation = Evaluation(predicted_10,real,repo_size)
	139	+ p_10.append(evaluation.run(Precision()))
	140	+ predicted_100 = RecommendationResult(dict.fromkeys(ranking[:100],1))
	141	+ evaluation = Evaluation(predicted_100,real,repo_size)
	142	+ f05_100.append(evaluation.run(F_score(0.5)))
	143	+ c_10[k][size] = c_10[k][size].union(recommendation.ranking[:10])
	144	+ c_100[k][size] = c_100[k][size].union(recommendation.ranking[:100])
	145	+ # save summary
	146	+ if p_10:
	147	+ p_10_summary[k][size].append(numpy.mean(p_10))
	148	+ if f05_100:
	149	+ f05_100_summary[k][size].append(numpy.mean(f05_100))
	150	+
	151	+ with open(log_file+"-neighborhood%.3d-profile%.3d"%(k,size),'a') as f:
	152	+ f.write("%.4f\t\t%.4f\n" %
	153	+ (numpy.mean(p_10),numpy.mean(f05_100)))
	154	+
	155	+ # back to main flow
	156	+ coverage_10 = {}
	157	+ coverage_100 = {}
	158	+ for k in neighbor_size:
	159	+ coverage_10[k] = {}
	160	+ coverage_100[k] = {}
	161	+ with open(comment_10[k],'a') as f:
	162	+ for size in profile_size:
	163	+ coverage_10[k][size] = len(c_10[k][size])/float(repo_size)
	164	+ f.write("%3d\t\t%3d\t\t%.4f\t%.4f\t%.4f\n" %
	165	+ (k,size,numpy.mean(p_10_summary[k][size]),
	166	+ numpy.std(p_10_summary[k][size]),coverage_10[k][size]))
	167	+ with open(comment_100[k],'a') as f:
	168	+ for size in profile_size:
	169	+ coverage_100[k][size] = len(c_100[k][size])/float(repo_size)
	170	+ f.write("%3d\t\t%3d\t\t%.4f\t%.4f\t%.4f\n" %
	171	+ (k,size,numpy.mean(f05_100_summary[k][size]),
	172	+ numpy.std(f05_100_summary[k][size]),coverage_100[k][size]))
	173	+
	174	+ for k in neighbor_size:
	175	+ # plot results summary
	176	+ g = Gnuplot.Gnuplot()
	177	+ g('set style data lines')
	178	+ g('set yrange [0:1.0]')
	179	+ g.xlabel('Profile size')
	180	+ g.title("Setup: %s-neighborhood%3d (threshold 10)" % (cfg.strategy,k))
	181	+ g.plot(Gnuplot.Data(sorted([[i,numpy.mean(p_10_summary[k][i]),numpy.std(p_10_summary[k][i])]
	182	+ for i in p_10_summary[k].keys()]),title="Precision"),
	183	+ Gnuplot.Data(sorted([[i,numpy.mean(p_10_summary[k][i]),numpy.std(p_10_summary[k][i])]
	184	+ for i in p_10_summary[k].keys()]),title="Deviation",
	185	+ with_="yerrorbar lt 2 pt 6"),
	186	+ Gnuplot.Data(sorted([[i,coverage_10[k][i]]
	187	+ for i in coverage_10[k].keys()]),title="Coverage"))
	188	+ g.hardcopy(graph_10[k],terminal="png")
	189	+
	190	+ g = Gnuplot.Gnuplot()
	191	+ g('set style data lines')
	192	+ g('set yrange [0:1.0]')
	193	+ g.xlabel('Profile size')
	194	+ g.title("Setup: %s-neighborhood%3d (threshold 100)" % (cfg.strategy,k))
	195	+ g.plot(Gnuplot.Data(sorted([[i,numpy.mean(f05_100_summary[k][i]),numpy.std(f05_100_summary[k][i])]
	196	+ for i in f05_100_summary[k].keys()]),title="F05"),
	197	+ Gnuplot.Data(sorted([[i,numpy.mean(f05_100_summary[k][i]),numpy.std(f05_100_summary[k][i])]
	198	+ for i in f05_100_summary[k].keys()]),title="Deviation",
	199	+ with_="yerrorbar lt 2 pt 6"),
	200	+ Gnuplot.Data(sorted([[i,coverage_100[k][i]]
	201	+ for i in coverage_100[k].keys()]),title="Coverage"))
	202	+ g.hardcopy(graph_100[k],terminal="png")