Updated experiments scripts

Tássia Camões Araújo
1 parent 4d01144b
Showing 6 changed files with 598 additions and 172 deletions Show diff stats
src/experiments/extract-sample-db.py
src/experiments/hybrid.py
src/experiments/k-suite.py
src/experiments/pure.py
src/experiments/roc-suite.py
src/experiments/sample-popcon-arch.py
@@ -0,0 +1,49 @@
+#! /usr/bin/env python
+"""
+    extract-sample-db - move the submissions listed in a sample file from the popcon index into a new index
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import xapian
+import os
+import random
+import sys
+
+if __name__ == '__main__':
+    # Moves every submission listed in sample_file out of the popcon index
+    # and into a new index named "<popcon_index>-<sample file name>".
+    if len(sys.argv) < 3:
+        print "Usage: extract-sample-db sample_file popcon_index"
+        sys.exit(1)
+    sample_file = sys.argv[1]
+    popcon_path = sys.argv[2]
+    sample_name = sample_file.split("/")[-1]
+    try:
+        popcon = xapian.WritableDatabase(popcon_path,xapian.DB_OPEN)
+    except xapian.Error, error:
+        # Show the real failure instead of hiding it behind the usage text.
+        print "Could not open popcon index %s: %s" % (popcon_path,error)
+        print "Usage: extract-sample-db sample_file popcon_index"
+        sys.exit(1)
+    enquire = xapian.Enquire(popcon)
+    print sample_file.split("/")
+    new_popcon = xapian.WritableDatabase(popcon_path+"-"+sample_name,
+                                         xapian.DB_CREATE_OR_OVERWRITE)
+    print ("Popcon repository size: %d" % popcon.get_doccount())
+    # The with-block closes the sample file even if a xapian call fails.
+    with open(sample_file) as submissions:
+        for submission in submissions:
+            submission_id = submission.strip()
+            if not submission_id:
+                # A blank line would otherwise be queried as the bare term "ID".
+                continue
+            print "ID"+submission_id
+            enquire.set_query(xapian.Query("ID"+submission_id))
+            # At most the first 20 matches of each term are moved.
+            for m in enquire.get_mset(0,20):
+                print "Adding doc %s"%m.docid
+                new_popcon.add_document(popcon.get_document(m.docid))
+                print "Removing doc %s"%m.docid
+                popcon.delete_document(m.docid)
+    print ("Popcon repository size: %d" % popcon.get_doccount())
+    # This count belongs to the newly created index, not to popcon.
+    print ("Sample repository size: %d" % new_popcon.get_doccount())
@@ -0,0 +1,197 @@
+#!/usr/bin/env python
+"""
+    hybrid-suite
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import sys
+sys.path.insert(0,'../')
+from config import Config
+from data import PopconXapianIndex, PopconSubmission
+from recommender import Recommender
+from user import LocalSystem, User
+from evaluation import *
+import logging
+import random
+import Gnuplot
+import numpy
+
+if __name__ == '__main__':
+    # Both argv[1] (strategy) and argv[2] (sample file) are read below.
+    if len(sys.argv)<3:
+        print "Usage: hybrid strategy sample_file"
+        sys.exit(1)
+
+    # os is used below but is not imported at the top of this script.
+    # NOTE(review): presumably it used to arrive through
+    # "from evaluation import *" - confirm; importing it here is harmless.
+    import os
+
+    iterations = 20
+    profile_size = [10,40,70,100,170,240]
+    neighbor_size = [3,10,50,100,200,400]
+
+    #hybrid_strategies = ['knnco','knnco_eset']
+
+    #iterations = 1
+    #profile_size = [10,20,30]
+    #neighbor_size = [10,20,30]
+
+    cfg = Config()
+    strategy = sys.argv[1]
+    sample_file = sys.argv[2]
+    sample_str = sample_file.split('/')[-1]
+    # Each line of the sample file is used as an id; the submission path is
+    # cfg.popcon_dir/<first two characters of the id>/<id>.
+    population_sample = []
+    with open(sample_file,'r') as f:
+        for line in f:
+            user_id = line.strip('\n')
+            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
+    sample_dir = ("results/hybrid/%s" % sample_str)
+    if not os.path.exists(sample_dir):
+        os.makedirs(sample_dir)
+
+    cfg.strategy = strategy
+    # Indexed by neighborhood size k and then by profile size: lists of
+    # per-user averages and sets of recommended items.
+    p_20_summary = {}
+    f05_100_summary = {}
+    c_20 = {}
+    c_100 = {}
+
+    # File names, indexed by neighborhood size k.
+    log_file = os.path.join(sample_dir,sample_str+"-"+cfg.strategy)
+    graph_20 = {}
+    graph_100 = {}
+    graph_20_jpg = {}
+    graph_100_jpg = {}
+    comment_20 = {}
+    comment_100 = {}
+    for k in neighbor_size:
+        # NOTE(review): "neighboorhod" here differs from the "neighboorhood"
+        # used for the log names; kept so output file names do not change.
+        graph_20[k] = log_file+("-neighboorhod%.3d-020.png"%k)
+        graph_100[k] = log_file+("-neighboorhod%.3d-100.png"%k)
+        # str.strip(".png") removes a set of characters from both ends, not
+        # the suffix, so the extension is cut off explicitly.
+        graph_20_jpg[k] = graph_20[k][:-len(".png")]+".jpg"
+        graph_100_jpg[k] = graph_100[k][:-len(".png")]+".jpg"
+        comment_20[k] = graph_20_jpg[k]+".comment"
+        comment_100[k] = graph_100_jpg[k]+".comment"
+
+        # Header of the two summary files (threshold 20 and threshold 100).
+        for comment_file,threshold,columns in (
+                (comment_20[k],20,"p_20\tc_20"),
+                (comment_100[k],100,"f05_100\tc_100")):
+            with open(comment_file,'w') as f:
+                f.write("# %s\n" % sample_str)
+                f.write("# strategy %s\n# threshold %d\n# iterations %d\n\n" %
+                        (cfg.strategy,threshold,iterations))
+                f.write("# neighboorhood\tprofile\t%s\n\n" % columns)
+
+        c_20[k] = {}
+        c_100[k] = {}
+        p_20_summary[k] = {}
+        f05_100_summary[k] = {}
+        for size in profile_size:
+            c_20[k][size] = set()
+            c_100[k][size] = set()
+            p_20_summary[k][size] = []
+            f05_100_summary[k][size] = []
+            # Header of the per-setup log; one line per user is appended later.
+            with open(log_file+"-neighboorhood%.3d-profile%.3d"%(k,size),'w') as f:
+                f.write("# %s\n" % sample_str)
+                f.write("# strategy %s-neighboorhood%.3d-profile%.3d\n\n" % (cfg.strategy,k,size))
+                f.write("# p_20\t\tf05_100\n\n")
+
+    # main loop per user
+    # NOTE(review): PopconSystem is not among the names imported at the top
+    # of this script - presumably it arrives through
+    # "from evaluation import *"; confirm.
+    for submission_file in population_sample:
+        user = PopconSystem(submission_file)
+        user.filter_pkg_profile(cfg.pkgs_filter)
+        user.maximal_pkg_profile()
+        for k in neighbor_size:
+            cfg.k_neighbors = k
+            for size in profile_size:
+                cfg.profile_size = size
+                rec = Recommender(cfg)
+                repo_size = rec.items_repository.get_doccount()
+                p_20 = []
+                f05_100 = []
+                for n in range(iterations):
+                    # Fill sample profile: 90% of the user's packages are
+                    # moved into "sample" (the expected result); what is left
+                    # in item_score is the profile given to the recommender.
+                    item_score = dict((pkg,user.item_score[pkg])
+                                      for pkg in user.pkg_profile)
+                    sample = {}
+                    sample_size = int(len(user.pkg_profile)*0.9)
+                    for i in range(sample_size):
+                        key = random.choice(item_score.keys())
+                        sample[key] = item_score.pop(key)
+                    iteration_user = User(item_score)
+                    recommendation = rec.get_recommendation(iteration_user,repo_size)
+                    if not hasattr(recommendation,"ranking"):
+                        continue
+                    ranking = recommendation.ranking
+                    real = RecommendationResult(sample)
+                    predicted_20 = RecommendationResult(dict.fromkeys(ranking[:20],1))
+                    evaluation = Evaluation(predicted_20,real,repo_size)
+                    p_20.append(evaluation.run(Precision()))
+                    predicted_100 = RecommendationResult(dict.fromkeys(ranking[:100],1))
+                    evaluation = Evaluation(predicted_100,real,repo_size)
+                    f05_100.append(evaluation.run(F_score(0.5)))
+                    # Everything ever recommended, used for coverage later.
+                    c_20[k][size].update(ranking[:20])
+                    c_100[k][size].update(ranking[:100])
+                # save summary
+                # p_20 and f05_100 are always appended together; when no
+                # iteration produced a ranking there is nothing to average
+                # (the log write used to divide by zero in that case).
+                if not p_20:
+                    continue
+                p_20_mean = sum(p_20)/len(p_20)
+                f05_100_mean = sum(f05_100)/len(f05_100)
+                p_20_summary[k][size].append(p_20_mean)
+                f05_100_summary[k][size].append(f05_100_mean)
+
+                with open(log_file+"-neighboorhood%.3d-profile%.3d"%(k,size),'a') as f:
+                    f.write("%.4f\t\t%.4f\n" % (p_20_mean,f05_100_mean))
+
+    # back to main flow
+    if not population_sample:
+        # repo_size is only assigned inside the per-user loop.
+        print "No submissions in %s, nothing to summarize" % sample_file
+        sys.exit(1)
+
+    def mean(values):
+        # An empty list (no ranking was ever produced for that setup)
+        # averages to 0 instead of raising ZeroDivisionError.
+        if not values:
+            return 0
+        return float(sum(values))/len(values)
+
+    def plot_summary(k,threshold,metric_title,summary,coverage,graph_file):
+        # One graph per (k, threshold): mean metric and coverage against
+        # the profile size, written to graph_file as PNG.
+        g = Gnuplot.Gnuplot()
+        g('set style data lines')
+        g('set yrange [0:1.0]')
+        g.xlabel('Profile size')
+        g.title("Setup: %s-neighboorhood%3d (threshold %d)" % (cfg.strategy,k,threshold))
+        g.plot(Gnuplot.Data(sorted([[i,mean(summary[i])]
+                                    for i in summary.keys()]),title=metric_title),
+               Gnuplot.Data(sorted([[i,coverage[i]]
+                                    for i in coverage.keys()]),title="Coverage"))
+        g.hardcopy(graph_file,terminal="png")
+        # The JPEG conversion (convert -quality 100) is left disabled here.
+
+    # Coverage = distinct recommended items / repository size.
+    coverage_20 = {}
+    coverage_100 = {}
+    for k in neighbor_size:
+        coverage_20[k] = {}
+        coverage_100[k] = {}
+        with open(comment_20[k],'a') as f:
+            for size in profile_size:
+                coverage_20[k][size] = len(c_20[k][size])/float(repo_size)
+                f.write("%3d\t\t%3d\t\t%.4f\t%.4f\n" %
+                        (k,size,mean(p_20_summary[k][size]),coverage_20[k][size]))
+        with open(comment_100[k],'a') as f:
+            for size in profile_size:
+                coverage_100[k][size] = len(c_100[k][size])/float(repo_size)
+                f.write("%3d\t\t%3d\t\t%.4f\t%.4f\n" %
+                        (k,size,mean(f05_100_summary[k][size]),coverage_100[k][size]))
+
+    for k in neighbor_size:
+        # plot results summary
+        plot_summary(k,20,"Precision",p_20_summary[k],coverage_20[k],graph_20[k])
+        plot_summary(k,100,"F05",f05_100_summary[k],coverage_100[k],graph_100[k])
 #!/usr/bin/env python
 """
-    recommender suite - recommender experiments suite 
+    k-suite - experiment different neighborhood sizes
 """
 __author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
 __copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
@@ -31,25 +31,38 @@ import random
 import Gnuplot
 import numpy
-def plot_roc(p,roc_points,log_file):
+def plot_roc(k,roc_points,log_file):
+    """Plot roc_points for neighborhood size k next to the (0,0)-(1,1) line.
+
+    The graph is titled with the last path component of log_file and k,
+    and is written to log_file + "-k<k>.png" and log_file + "-k<k>.ps".
+    """
     g = Gnuplot.Gnuplot()
     g('set style data points')
     g.xlabel('False Positive Rate')
     g.ylabel('True Positive Rate')
     g('set xrange [0:1.0]')
     g('set yrange [0:1.0]')
-    g.title("Setup: %s" % log_file.split("/")[-1])
+    g.title("Setup: %s-k%d" % (log_file.split("/")[-1],k))
     g.plot(Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"),
-           Gnuplot.Data(roc_points,title="k %d"%k))
+           Gnuplot.Data(roc_points))
     g.hardcopy(log_file+("-k%.3d.png"%k),terminal="png")
     g.hardcopy(log_file+("-k%.3d.ps"%k),terminal="postscript",enhanced=1,color=1)
+def plot_summary(precision,f05,mcc,log_file):
+    """Plot the mean precision, F0.5 and MCC against the neighborhood size.
+
+    precision, f05 and mcc are dicts mapping each k to a list of values;
+    the mean of each list is plotted at x = k. The graph is written to
+    log_file + ".png" and log_file + ".ps".
+    """
+    def mean_points(summary):
+        # Sorted by k: dict key order is arbitrary and the 'lines' style
+        # joins the points in the order they are given. An empty list
+        # counts as 0, like the get_*_summary methods of ExperimentResults.
+        return sorted([k,(sum(values)/len(values)) if values else 0]
+                      for k,values in summary.items())
+    g = Gnuplot.Gnuplot()
+    g('set style data lines')
+    g.xlabel('Neighborhood (k)')
+    g.title("Setup: %s-size20" % (log_file.split("/")[-1]))
+    g.plot(Gnuplot.Data(mean_points(precision),title="P"),
+           Gnuplot.Data(mean_points(f05),title="F05"),
+           Gnuplot.Data(mean_points(mcc),title="MCC"))
+    g.hardcopy(log_file+(".png"),terminal="png")
+    g.hardcopy(log_file+(".ps"),terminal="postscript",enhanced=1,color=1)
+
 class ExperimentResults:
     def __init__(self,repo_size):
         self.repository_size = repo_size
         self.precision = []
         self.recall = []
         self.fpr = []
+        self.f05 = []
+        self.mcc = []
     def add_result(self,ranking,sample):
         predicted = RecommendationResult(dict.fromkeys(ranking,1))
@@ -58,49 +71,72 @@ class ExperimentResults:
         self.precision.append(evaluation.run(Precision()))
         self.recall.append(evaluation.run(Recall()))
         self.fpr.append(evaluation.run(FPR()))
+        self.f05.append(evaluation.run(F_score(0.5)))
+        self.mcc.append(evaluation.run(MCC()))
-    # Average ROC by threshold (whici is the size)
     def get_roc_point(self):
         tpr = self.recall
         fpr = self.fpr
+        if not tpr or not fpr:
+            return [0,0]
         return [sum(fpr)/len(fpr),sum(tpr)/len(tpr)]
     def get_precision_summary(self):
+        if not self.precision: return 0
         return  sum(self.precision)/len(self.precision)
-    def get_recall_summary(self):
-        return  sum(self.recall)/len(self.recall)
+    def get_f05_summary(self):
+        if not self.f05: return 0
+        return  sum(self.f05)/len(self.f05)
+
+    def get_mcc_summary(self):
+        if not self.mcc: return 0
+        return  sum(self.mcc)/len(self.mcc)
 if __name__ == '__main__':
-    # experiment parameters
+    if len(sys.argv)<3:
+        print "Usage: k-suite strategy_str sample_file"
+        exit(1)
     threshold = 20
     iterations = 30
-    sample_file = "results/misc-popcon/sample-050-100"
     neighbors = [3,5,10,50,100,150,200,300,400,500]
     cfg = Config()
-    cfg.strategy = "knn"
-    print cfg.popcon_index
-    sample = []
+    cfg.strategy = sys.argv[1]
+    sample_file = sys.argv[2]
+    population_sample = []
     with open(sample_file,'r') as f:
         for line in f.readlines():
             user_id = line.strip('\n')
-            sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
+            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
     # setup dictionaries and files
-    roc_points = {}
+    roc_summary = {}
     recommended = {}
-    precisions = {}
-    aucs = {}
-    log_file = "results/k-suite/sample-050-100/%s" % (cfg.strategy)
+    precision_summary = {}
+    f05_summary = {}
+    mcc_summary = {}
+    sample_dir = ("results/k-suite/%s" % sample_file.split('/')[-1])
+    if not os.path.exists(sample_dir):
+        os.makedirs(sample_dir)
+    log_file = os.path.join(sample_dir,cfg.strategy)
+    with open(log_file,'w') as f:
+        f.write("# %s\n\n" % sample_file.split('/')[-1])
+        f.write("# strategy %s recommendation_size %d iterations %d\n\n" %
+                (cfg.strategy,threshold,iterations))
+        f.write("# k coverage \tprecision \tf05 \tmcc\n\n")
+
     for k in neighbors:
-        roc_points[k] = []
+        roc_summary[k] = []
         recommended[k] = set()
-        precisions[k] = []
-        aucs[k] = []
+        precision_summary[k] = []
+        f05_summary[k] = []
+        mcc_summary[k] = []
         with open(log_file+"-k%.3d"%k,'w') as f:
+            f.write("# %s\n\n" % sample_file.split('/')[-1])
             f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))
-            f.write("# roc_point \tp(20) \tauc\n\n") 
+            f.write("# roc_point \tprecision \tf05 \tmcc\n\n")
+
     # main loop per user
-    for submission_file in sample:
+    for submission_file in population_sample:
         user = PopconSystem(submission_file)
         user.filter_pkg_profile(cfg.pkgs_filter)
         user.maximal_pkg_profile()
@@ -112,12 +148,12 @@ if __name__ == &#39;__main__&#39;:
             # n iterations for same recommender and user
             for n in range(iterations):
                 # Fill sample profile
-                profile_size = len(user.pkg_profile)
+                profile_len = len(user.pkg_profile)
                 item_score = {}
                 for pkg in user.pkg_profile:
                     item_score[pkg] = user.item_score[pkg]
                 sample = {}
-                sample_size = int(profile_size*0.9)
+                sample_size = int(profile_len*0.9)
                 for i in range(sample_size):
                      key = random.choice(item_score.keys())
                      sample[key] = item_score.pop(key)
@@ -125,28 +161,26 @@ if __name__ == &#39;__main__&#39;:
                 recommendation = rec.get_recommendation(iteration_user,threshold)
                 if hasattr(recommendation,"ranking"):
                     results.add_result(recommendation.ranking,sample)
-                    print "ranking",recommendation.ranking
-                    print "recommended_%d"%k,recommended[k]
                     recommended[k] = recommended[k].union(recommendation.ranking)
-                    print recommended[k]
             # save summary
             roc_point = results.get_roc_point()
-            auc = numpy.trapz(y=[0,roc_point[1],1],x=[0,roc_point[0],1])
-            p_20 = results.get_precision_summary()
-            roc_points[k].append(roc_point)
-            aucs[k].append(auc)
-            precisions[k].append(p_20)
+            roc_summary[k].append(roc_point)
+            precision = results.get_precision_summary()
+            precision_summary[k].append(precision)
+            f05 = results.get_f05_summary()
+            f05_summary[k].append(f05)
+            mcc = results.get_mcc_summary()
+            mcc_summary[k].append(mcc)
             with open(log_file+"-k%.3d"%k,'a') as f:
-                f.write("%s \t%.2f \t%.4f\n" % (str(roc_point),p_20,auc))
+                f.write("[%.2f,%.2f] \t%.4f \t%.4f \t%.4f\n" %
+                        (roc_point[0],roc_point[1],precision,f05,mcc))
     # back to main flow
-    with open(log_file,'w') as f:
-        f.write("# k coverage \tp(20) \tauc\n\n")
+    # Append one row per k below the header written during setup.
+    with open(log_file,'a') as f:
+        plot_summary(precision_summary,f05_summary,mcc_summary,log_file)
         for k in neighbors:
-            print "len_recommended_%d"%k,len(recommended[k])
-            print "repo_size",repo_size
-            coverage = len(recommended[k])/float(repo_size)
-            print coverage
-            f.write("%d \t%.2f \t%.2f \t%.2fi\n" %
-                    (k,coverage,float(sum(precisions[k]))/len(precisions[k]),
-                     float(sum(aucs[k]))/len(aucs[k])))
-            plot_roc(k,roc_points[k],log_file)
+            # recommended is keyed by k (see the setup loop); indexing it
+            # with "size", which is not a name of this loop, was a bug.
+            coverage = len(recommended[k])/float(repo_size)
+            f.write("%3d \t%.2f \t%.4f \t%.4f \t%.4f\n" %
+                    (k,coverage,float(sum(precision_summary[k]))/len(precision_summary[k]),
+                     float(sum(f05_summary[k]))/len(f05_summary[k]),
+                     float(sum(mcc_summary[k]))/len(mcc_summary[k])))
+            plot_roc(k,roc_summary[k],log_file)
@@ -0,0 +1,199 @@
+#!/usr/bin/env python
+"""
+    profile-suite - experiment different profile sizes
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import sys
+sys.path.insert(0,'../')
+from config import Config
+from data import PopconXapianIndex, PopconSubmission
+from recommender import Recommender
+from user import LocalSystem, User
+from evaluation import *
+import logging
+import random
+import Gnuplot
+import numpy
+
+if __name__ == '__main__':
+    # Both argv[1] (strategy category) and argv[2] (sample file) are read
+    # below.
+    if len(sys.argv)<3:
+        print "Usage: profile-suite strategy_category sample_file"
+        sys.exit(1)
+
+    # os is used below but is not imported at the top of this script.
+    # NOTE(review): presumably it used to arrive through
+    # "from evaluation import *" - confirm; importing it here is harmless.
+    import os
+
+    iterations = 20
+    profile_size = [10,20,40,70,100,140,170,200,240]
+    neighbor_size = [3,5,10,50,100,150,200,300,400,500]
+
+    content_strategies = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
+    collaborative_strategies = ['knn_eset']#,'knn_eset','knn_plus']
+    #collaborative_strategies = ['knn','knn_eset','knn_plus']
+
+    #iterations = 1
+    #profile_size = [10,20,30]
+    #neighbor_size = [10,20,30]
+    #content_strategies = ['cb']
+    #collaborative_strategies = ['knn_eset']
+
+    # category -> (strategies to run, sizes to evaluate, name of the size)
+    categories = {
+        "content": (content_strategies,profile_size,"profile"),
+        "collaborative": (collaborative_strategies,neighbor_size,"neighborhood"),
+    }
+    strategy_category = sys.argv[1]
+    if strategy_category not in categories:
+        print "Usage: profile-suite strategy_category sample_file"
+        sys.exit(1)
+    strategies,sizes,option_str = categories[strategy_category]
+
+    cfg = Config()
+    sample_file = sys.argv[2]
+    sample_str = sample_file.split('/')[-1]
+    # Each line of the sample file is used as an id; the submission path is
+    # cfg.popcon_dir/<first two characters of the id>/<id>.
+    population_sample = []
+    with open(sample_file,'r') as f:
+        for line in f:
+            user_id = line.strip('\n')
+            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
+    sample_dir = ("results/%s/%s" %
+                  (strategy_category,sample_str))
+    if not os.path.exists(sample_dir):
+        os.makedirs(sample_dir)
+
+    # os and commands are used below but are not imported at the top of
+    # this script.
+    # NOTE(review): presumably they used to arrive through
+    # "from evaluation import *" - confirm; importing them here is harmless.
+    import os
+    import commands
+
+    def mean(values):
+        # An empty list (no ranking was ever produced for that size)
+        # averages to 0 instead of raising ZeroDivisionError.
+        if not values:
+            return 0
+        return float(sum(values))/len(values)
+
+    def plot_summary(threshold,metric_title,summary,coverage,graph_file,jpg_file):
+        # One graph per threshold: mean metric and coverage against the
+        # evaluated sizes, written as PNG and then converted to JPEG.
+        g = Gnuplot.Gnuplot()
+        g('set style data lines')
+        g('set yrange [0:1.0]')
+        g.xlabel('%s size'%option_str.capitalize())
+        g.title("Setup: %s (threshold %d)" % (cfg.strategy,threshold))
+        g.plot(Gnuplot.Data(sorted([[k,mean(summary[k])]
+                                    for k in summary.keys()]),title=metric_title),
+               Gnuplot.Data(sorted([[k,coverage[k]]
+                                    for k in coverage.keys()]),title="Coverage"))
+        g.hardcopy(graph_file,terminal="png")
+        # Each PNG is converted into its own JPEG. The threshold-20 call
+        # used to convert graph_100 (not plotted yet at that point) at
+        # quality 20.
+        commands.getoutput("convert -quality 100 %s %s" %
+                           (graph_file,jpg_file))
+
+    for strategy in strategies:
+        cfg.strategy = strategy
+        # Indexed by size: lists of per-user averages and sets of
+        # recommended items.
+        p_20_summary = {}
+        f05_100_summary = {}
+        c_20 = {}
+        c_100 = {}
+
+        log_file = os.path.join(sample_dir,sample_str+"-"+cfg.strategy)
+        graph_20 = log_file+"-20.png"
+        graph_100 = log_file+"-100.png"
+        # The .jpg names are built from the common stem: str.strip(".png")
+        # removes a set of characters from both ends, not the suffix.
+        graph_20_jpg = log_file+"-20.jpg"
+        graph_100_jpg = log_file+"-100.jpg"
+        comment_20 = graph_20_jpg+".comment"
+        comment_100 = graph_100_jpg+".comment"
+
+        # Headers of the two summary files.
+        with open(comment_20,'w') as f:
+            f.write("# sample %s\n" % sample_str)
+            f.write("# strategy %s\n# threshold 20\n# iterations %d\n\n" %
+                    (cfg.strategy,iterations))
+            f.write("# %s\tp_20\tc_20\n\n"%option_str)
+        with open(comment_100,'w') as f:
+            f.write("# sample %s\n" % sample_str)
+            f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
+                    (cfg.strategy,iterations))
+            f.write("# %s\t\tf05_100\t\tc_100\n\n"%option_str)
+
+        for size in sizes:
+            c_20[size] = set()
+            c_100[size] = set()
+            p_20_summary[size] = []
+            f05_100_summary[size] = []
+            # Header of the per-size log; one line per user is appended later.
+            with open(log_file+"-%s%.3d"%(option_str,size),'w') as f:
+                f.write("# sample %s\n" % sample_str)
+                f.write("# strategy %s-%s%.3d\n\n" % (cfg.strategy,option_str,size))
+                f.write("# p_20\tf05_100\n\n")
+
+        # main loop per user
+        # NOTE(review): PopconSystem is not among the names imported at the
+        # top of this script - presumably it arrives through
+        # "from evaluation import *"; confirm.
+        for submission_file in population_sample:
+            user = PopconSystem(submission_file)
+            user.filter_pkg_profile(cfg.pkgs_filter)
+            user.maximal_pkg_profile()
+            for size in sizes:
+                # The same value is set for both options.
+                cfg.profile_size = size
+                cfg.k_neighbors = size
+                rec = Recommender(cfg)
+                repo_size = rec.items_repository.get_doccount()
+                p_20 = []
+                f05_100 = []
+                for n in range(iterations):
+                    # Fill sample profile: 90% of the user's packages are
+                    # moved into "sample" (the expected result); what is left
+                    # in item_score is the profile given to the recommender.
+                    item_score = dict((pkg,user.item_score[pkg])
+                                      for pkg in user.pkg_profile)
+                    sample = {}
+                    sample_size = int(len(user.pkg_profile)*0.9)
+                    for i in range(sample_size):
+                        key = random.choice(item_score.keys())
+                        sample[key] = item_score.pop(key)
+                    iteration_user = User(item_score)
+                    recommendation = rec.get_recommendation(iteration_user,repo_size)
+                    if not hasattr(recommendation,"ranking"):
+                        continue
+                    ranking = recommendation.ranking
+                    real = RecommendationResult(sample)
+                    predicted_20 = RecommendationResult(dict.fromkeys(ranking[:20],1))
+                    evaluation = Evaluation(predicted_20,real,repo_size)
+                    p_20.append(evaluation.run(Precision()))
+                    predicted_100 = RecommendationResult(dict.fromkeys(ranking[:100],1))
+                    evaluation = Evaluation(predicted_100,real,repo_size)
+                    f05_100.append(evaluation.run(F_score(0.5)))
+                    # Everything ever recommended, used for coverage later.
+                    c_20[size].update(ranking[:20])
+                    c_100[size].update(ranking[:100])
+                # save summary
+                # p_20 and f05_100 are always appended together; when no
+                # iteration produced a ranking there is nothing to average
+                # (the log write used to divide by zero in that case).
+                if not p_20:
+                    continue
+                p_20_mean = sum(p_20)/len(p_20)
+                f05_100_mean = sum(f05_100)/len(f05_100)
+                p_20_summary[size].append(p_20_mean)
+                f05_100_summary[size].append(f05_100_mean)
+
+                with open(log_file+"-%s%.3d"%(option_str,size),'a') as f:
+                    f.write("%.4f \t%.4f\n" % (p_20_mean,f05_100_mean))
+
+        # back to main flow
+        if not population_sample:
+            # repo_size is only assigned inside the per-user loop.
+            print "No submissions in %s, nothing to summarize" % sample_file
+            sys.exit(1)
+
+        # Coverage = distinct recommended items / repository size.
+        coverage_20 = {}
+        coverage_100 = {}
+        with open(comment_20,'a') as f:
+            for size in sizes:
+                coverage_20[size] = len(c_20[size])/float(repo_size)
+                f.write("%3d\t\t%.4f\t\t%.4f\n" %
+                        (size,mean(p_20_summary[size]),coverage_20[size]))
+        with open(comment_100,'a') as f:
+            for size in sizes:
+                coverage_100[size] = len(c_100[size])/float(repo_size)
+                f.write("%3d\t\t%.4f\t\t%.4f\n" %
+                        (size,mean(f05_100_summary[size]),coverage_100[size]))
+
+        # plot results summary
+        plot_summary(20,"Precision",p_20_summary,coverage_20,graph_20,graph_20_jpg)
+        plot_summary(100,"F05",f05_100_summary,coverage_100,graph_100,graph_100_jpg)
@@ -43,11 +43,11 @@ import numpy
 iterations = 30
 sample_proportions = [0.9]
-weighting = [('bm25',1.0),('bm25',1.2),('bm25',2.0),('trad',0)]
+weighting = [('bm25',1.0)]
 content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
 collaborative = ['knn_eset','knn','knn_plus']
 hybrid = ['knnco','knnco_eset']
-profile_size = range(20,200,20)
+profile_size = range(20,200,40)
 neighbors = range(10,510,50)
 def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
@@ -73,7 +73,7 @@ def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_fi
                 output.write(pkg+"\n")
     output.close()
-def plot_roc(roc_points,auc,eauc,c,p,log_file):
+def plot_roc(roc_points,eauc,c,p,log_file):
     g = Gnuplot.Gnuplot()
     g('set style data lines')
     g.xlabel('False Positive Rate')
@@ -83,52 +83,27 @@ def plot_roc(roc_points,auc,eauc,c,p,log_file):
     g.title("Setup: %s" % log_file.split("/")[-1])
     g('set label "C %.2f" at 0.8,0.25' % c)
     g('set label "P(20) %.2f" at 0.8,0.2' % p)
-    g('set label "AUC %.4f" at 0.8,0.15' % auc)
-    g('set label "EAUC %.4f" at 0.8,0.1' % eauc)
+    g('set label "AUC %.4f" at 0.8,0.15' % eauc)
     g.plot(Gnuplot.Data(roc_points,title="ROC"),
-           Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"),
-           Gnuplot.Data([roc_points[-1],[1,1]],with_="lines lt 6"))
+           Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"))
+           #Gnuplot.Data([roc_points[-1],[1,1]],with_="lines lt 6"))
     g.hardcopy(log_file+"-roc.png",terminal="png")
     g.hardcopy(log_file+"-roc.ps",terminal="postscript",enhanced=1,color=1)
-def plot_summary(precision,recall,f1,f05,accuracy,log_file):
-    # Plot metrics summary
-    g = Gnuplot.Gnuplot()
-    g('set style data lines')
-    g.xlabel('Recommendation size')
-    g.title("Setup: %s" % log_file.split("/")[-1])
-    g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
-           Gnuplot.Data(precision,title="Precision"),
-           Gnuplot.Data(recall,title="Recall"),
-           Gnuplot.Data(f1,title="F_1"),
-           Gnuplot.Data(f05,title="F_0.5"))
-    g.hardcopy(log_file+".png",terminal="png")
-    g.hardcopy(log_file+".ps",terminal="postscript",enhanced=1,color=1)
-    g('set logscale x')
-    g('replot')
-    g.hardcopy(log_file+"-logscale.png",terminal="png")
-    g.hardcopy(log_file+"-logscale.ps",terminal="postscript",enhanced=1,color=1)
-
 def get_label(cfg,sample_proportion):
+    """Build the label dict describing the current experiment setup.
+
+    Returns a dict with a "description" and a "values" entry, chosen by
+    which of the content_based, collaborative or hybrid lists contains
+    cfg.strategy; "values" is formatted from cfg.strategy plus
+    cfg.profile_size and/or cfg.k_neighbors. For any other strategy it
+    prints "Unknown strategy" and returns an empty dict.
+    sample_proportion is not used.
+    """
     label = {}
     if cfg.strategy in content_based:
-        label["description"] = "strategy-filter-profile-k1_bm25"
-        label["values"] = ("%s-profile%.3d-%s-kbm%.1f" %
-                           (cfg.strategy,cfg.profile_size,
-                            cfg.pkgs_filter.split("/")[-1],
-                            cfg.bm25_k1))
+        label["description"] = "strategy-profile"
+        label["values"] = ("%s-profile%.3d" %
+                           (cfg.strategy,cfg.profile_size))
     elif cfg.strategy in collaborative:
-       label["description"] = "strategy-knn-filter-k1_bm25"
-       label["values"] = ("%s-k%.3d-%s-kbm%.1f" %
-                          (cfg.strategy,cfg.k_neighbors,
-                           cfg.pkgs_filter.split("/")[-1],
-                           cfg.bm25_k1))
+       label["description"] = "strategy-knn"
+       label["values"] = ("%s-k%.3d" %
+                          (cfg.strategy,cfg.k_neighbors))
     elif cfg.strategy in hybrid:
-       label["description"] = "strategy-knn-filter-profile-k1_bm25"
-       label["values"] = ("%s-k%.3d-profile%.3d-%s-kbm%.1f" %
-                          (cfg.strategy,cfg.k_neighbors,cfg.profile_size,
-                           cfg.pkgs_filter.split("/")[-1],
-                           cfg.bm25_k1))
+       label["description"] = "strategy-knn-profile"
+       label["values"] = ("%s-k%.3d-profile%.3d" %
+                          (cfg.strategy,cfg.k_neighbors,cfg.profile_size))
     else:
         print "Unknown strategy"
     return label
@@ -136,41 +111,28 @@ def get_label(cfg,sample_proportion):
 class ExperimentResults:
     def __init__(self,repo_size):
         self.repository_size = repo_size
-        self.accuracy = {}
         self.precision = {}
         self.recall = {}
-        self.f1 = {}
-        self.f05 = {}
         self.fpr = {}
-        #points = [1]+range(10,200,10)+range(200,self.repository_size,100)
         points = [1]+range(10,self.repository_size,10)
         self.recommended = set()
         for size in points:
-            self.accuracy[size] = []
             self.precision[size] = []
             self.recall[size] = []
-            self.f1[size] = []
-            self.f05[size] = []
             self.fpr[size] = []
     def add_result(self,ranking,sample):
-        print "len_recommended", len(self.recommended)
-        print "len_rank", len(ranking)
         self.recommended = self.recommended.union(ranking)
-        print "len_recommended", len(self.recommended)
         # get data only for point
-        for size in self.accuracy.keys():
+        for size in self.precision.keys():
             predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
             real = RecommendationResult(sample)
             evaluation = Evaluation(predicted,real,self.repository_size)
-            #self.accuracy[size].append(evaluation.run(Accuracy()))
             self.precision[size].append(evaluation.run(Precision()))
             self.recall[size].append(evaluation.run(Recall()))
-            #self.f1[size].append(evaluation.run(F_score(1)))
-            #self.f05[size].append(evaluation.run(F_score(0.5)))
             self.fpr[size].append(evaluation.run(FPR()))
-    # Average ROC by threshold (whici is the size)
+    # Average ROC by threshold (= size of recommendation)
     def get_roc_points(self):
         points = []
         for size in self.recall.keys():
@@ -179,38 +141,6 @@ class ExperimentResults:
             points.append([sum(fpr)/len(fpr),sum(tpr)/len(tpr)])
         return sorted(points)
-    def get_precision_summary(self):
-        summary = [[size,sum(values)/len(values)] for size,values in self.precision.items()]
-        return sorted(summary)
-
-    def get_recall_summary(self):
-        summary = [[size,sum(values)/len(values)] for size,values in self.recall.items()]
-        return sorted(summary)
-
-    def get_f1_summary(self):
-        summary = [[size,sum(values)/len(values)] for size,values in self.f1.items()]
-        return sorted(summary)
-
-    def get_f05_summary(self):
-        summary = [[size,sum(values)/len(values)] for size,values in self.f05.items()]
-        return sorted(summary)
-
-    def get_accuracy_summary(self):
-        summary = [[size,sum(values)/len(values)] for size,values in self.accuracy.items()]
-        return sorted(summary)
-
-    def best_precision(self):
-        size = max(self.precision, key = lambda x: max(self.precision[x]) and x>10)
-        return (size,max(self.precision[size]))
-
-    def best_f1(self):
-        size = max(self.f1, key = lambda x: max(self.f1[x]))
-        return (size,max(self.f1[size]))
-
-    def best_f05(self):
-        size = max(self.f05, key = lambda x: max(self.f05[x]))
-        return (size,max(self.f05[size]))
-
 def run_strategy(cfg,user):
     for weight in weighting:
         cfg.weight = weight[0]
@@ -220,22 +150,24 @@ def run_strategy(cfg,user):
         for proportion in sample_proportions:
             results = ExperimentResults(repo_size)
             label = get_label(cfg,proportion)
-            #log_file = "results/20110906/4a67a295/"+label["values"]
-            log_file = "results/"+label["values"]
+            user_dir = ("results/roc-suite/%s" % user.user_id[:8])
+            if not os.path.exists(user_dir):
+                os.mkdir(user_dir)
+            log_file = os.path.join(user_dir,label["values"])
             for n in range(iterations):
                 # Fill sample profile
-                profile_size = len(user.pkg_profile)
+                profile_len = len(user.pkg_profile)
                 item_score = {}
                 for pkg in user.pkg_profile:
                     item_score[pkg] = user.item_score[pkg]
                 sample = {}
-                sample_size = int(profile_size*proportion)
+                sample_size = int(profile_len*proportion)
                 for i in range(sample_size):
                      key = random.choice(item_score.keys())
                      sample[key] = item_score.pop(key)
                 iteration_user = User(item_score)
                 recommendation = rec.get_recommendation(iteration_user,repo_size)
-                #write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file)
+                write_recall_log(label,n,sample,recommendation,profile_len,repo_size,log_file)
                 if hasattr(recommendation,"ranking"):
                     results.add_result(recommendation.ranking,sample)
             with open(log_file,'w') as f:
@@ -247,32 +179,12 @@ def run_strategy(cfg,user):
                         numpy.trapz(y=[0,roc_points[0][1]],x=[0,roc_points[0][0]])+
                         numpy.trapz(y=[roc_points[-1][1],1],x=[roc_points[-1][0],1]))
                 precision_20 = sum(results.precision[10])/len(results.precision[10])
-                print results.recommended
-                print "len",len(results.recommended)
                 coverage = len(results.recommended)/float(repo_size)
-                print "repo_size: ", float(repo_size)
-                print coverage
-                exit(1)
-                #f1_10 = sum(results.f1[10])/len(results.f1[10])
-                #f05_10 = sum(results.f05[10])/len(results.f05[10])
                 f.write("# %s\n# %s\n\n" %
                         (label["description"],label["values"]))
                 f.write("# coverage \tp(20) \tauc \teauc\n\t%.2f \t%.2f \t%.4f \t%.4f\n\n" %
                         (coverage,precision_20,auc,eauc))
-                #f.write("# best results (recommendation size; metric)\n")
-                #f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\nf05 (%d; %.2f)\n\n" %
-                #        (results.best_precision()[0],results.best_precision()[1],
-                #         results.best_f1()[0],results.best_f1()[1],
-                #         results.best_f05()[0],results.best_f05()[1]))
-                #f.write("# recommendation size 10\nprecision (10; %.2f)\nf1 (10; %.2f)\nf05 (10; %.2f)" %
-                #        (precision_10,f1_10,f05_10))
-            #precision = results.get_precision_summary()
-            #recall = results.get_recall_summary()
-            #f1 = results.get_f1_summary()
-            #f05 = results.get_f05_summary()
-            #accuracy = results.get_accuracy_summary()
-            #plot_summary(precision,recall,f1,f05,accuracy,log_file)
-            plot_roc(roc_points,auc,eauc,coverage,precision_20,log_file)
+            plot_roc(roc_points,eauc,coverage,precision_20,log_file)
 def run_content(user,cfg):
     for strategy in content_based:
@@ -288,10 +200,6 @@ def run_collaborative(user,cfg):
         cfg.strategy = strategy
         for k in neighbors:
             cfg.k_neighbors = k
-            #for size in popcon_size:
-            #    if size:
-            #        cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
-            #        cfg.popcon_programs = popcon_programs+"_"+size
             run_strategy(cfg,user)
 def run_hybrid(user,cfg):
@@ -301,28 +209,23 @@ def run_hybrid(user,cfg):
         cfg.strategy = strategy
         for k in neighbors:
             cfg.k_neighbors = k
-            #for size in popcon_size:
-            #    if size:
-            #        cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
-            #        cfg.popcon_programs = popcon_programs+"_"+size
             for size in profile_size:
                 cfg.profile_size = size
                 run_strategy(cfg,user)
 # Script entry point: argv[1] is the path of a popcon submission and any
 # further arguments select which strategy families are evaluated.
 if __name__ == '__main__':
-    #user = LocalSystem()
-    #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
     # the submission path is mandatory
+    if len(sys.argv)<2:
+        print "Usage: roc-suite popcon_submission_path [content|collaborative|hybrid]"
+        exit(1)
     cfg = Config()
-    #user = PopconSystem("/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7")
-    user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
-    #user =  PopconSystem("/root/.app-recommender/popcon-entries/4a/4a5834eb2aba6b6f17312239e0761c70")
+    user = PopconSystem(sys.argv[1])
     user.filter_pkg_profile(cfg.pkgs_filter)
     user.maximal_pkg_profile()
     # Each family runs when its name appears anywhere in sys.argv (the
     # submission path included) or when only the path was given.
-    if "content" in sys.argv or len(sys.argv)<2:
+    if "content" in sys.argv or len(sys.argv)<3:
         run_content(user,cfg)
-    if "collaborative" in sys.argv or len(sys.argv)<2:
+    if "collaborative" in sys.argv or len(sys.argv)<3:
         run_collaborative(user,cfg)
-    if "hybrid" in sys.argv or len(sys.argv)<2:
+    if "hybrid" in sys.argv or len(sys.argv)<3:
         run_hybrid(user,cfg)
@@ -0,0 +1,44 @@
+#! /usr/bin/env python
+"""
+    sample-popcon-arch - extract a sample of a specific arch
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+import sys
+sys.path.insert(0,'../')
+import xapian
+import os
+import random
+import sys
+from user import RandomPopcon
+
+if __name__ == '__main__':
+    try:
+        size = int(sys.argv[1])
+        arch = sys.argv[2]
+        popcon_dir = sys.argv[3]
+        pkgs_filter = sys.argv[4]
+    except:
+        print "Usage: sample-popcon-arch size arch popcon_dir pkgs_filter"
+        exit(1)
+
+    sample_file = ("results/misc-popcon/sample-%s-%d" % (arch,size))
+    with open(sample_file,'w') as f:
+        for n in range(1,size+1):
+            user = RandomPopcon(popcon_dir,arch,pkgs_filter)
+            f.write(user.user_id+'\n')
+            print "sample",n
@@ -0,0 +1,49 @@		@@ -0,0 +1,49 @@
	1	+#! /usr/bin/env python
	2	+"""
	3	+ sample-popcon - extract a sample from popcon population
	4	+"""
	5	+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
	6	+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
	7	+__license__ = """
	8	+ This program is free software: you can redistribute it and/or modify
	9	+ it under the terms of the GNU General Public License as published by
	10	+ the Free Software Foundation, either version 3 of the License, or
	11	+ (at your option) any later version.
	12	+
	13	+ This program is distributed in the hope that it will be useful,
	14	+ but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	16	+ GNU General Public License for more details.
	17	+
	18	+ You should have received a copy of the GNU General Public License
	19	+ along with this program. If not, see <http://www.gnu.org/licenses/>.
	20	+"""
	21	+
	22	+import xapian
	23	+import os
	24	+import random
	25	+import sys
	26	+
	27	+if __name__ == '__main__':
	28	+ try:
	29	+ sample_file = sys.argv[1]
	30	+ popcon = xapian.WritableDatabase(sys.argv[2],xapian.DB_OPEN)
	31	+ except:
	32	+ print "Usage: extract-sample-db sample_file popcon_index"
	33	+ exit(1)
	34	+ enquire = xapian.Enquire(popcon)
	35	+ print sample_file.split("/")
	36	+ new_popcon = xapian.WritableDatabase(sys.argv[2]+"-"+sample_file.split("/")[-1],xapian.DB_CREATE_OR_OVERWRITE)
	37	+ print ("Popcon repository size: %d" % popcon.get_doccount())
	38	+ for submission in open(sample_file):
	39	+ print "ID"+submission.strip()
	40	+ query = xapian.Query("ID"+submission.strip())
	41	+ enquire.set_query(query)
	42	+ mset = enquire.get_mset(0,20)
	43	+ for m in mset:
	44	+ print "Adding doc %s"%m.docid
	45	+ new_popcon.add_document(popcon.get_document(m.docid))
	46	+ print "Removing doc %s"%m.docid
	47	+ popcon.delete_document(m.docid)
	48	+ print ("Popcon repository size: %d" % popcon.get_doccount())
	49	+ print ("Popcon repository size: %d" % new_popcon.get_doccount())
@@ -0,0 +1,197 @@		@@ -0,0 +1,197 @@
	1	+#!/usr/bin/env python
	2	+"""
	3	+ hybrid-suite
	4	+"""
	5	+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
	6	+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
	7	+__license__ = """
	8	+ This program is free software: you can redistribute it and/or modify
	9	+ it under the terms of the GNU General Public License as published by
	10	+ the Free Software Foundation, either version 3 of the License, or
	11	+ (at your option) any later version.
	12	+
	13	+ This program is distributed in the hope that it will be useful,
	14	+ but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	16	+ GNU General Public License for more details.
	17	+
	18	+ You should have received a copy of the GNU General Public License
	19	+ along with this program. If not, see <http://www.gnu.org/licenses/>.
	20	+"""
	21	+
	22	+import sys
	23	+sys.path.insert(0,'../')
	24	+from config import Config
	25	+from data import PopconXapianIndex, PopconSubmission
	26	+from recommender import Recommender
	27	+from user import LocalSystem, User
	28	+from evaluation import *
	29	+import logging
	30	+import random
	31	+import Gnuplot
	32	+import numpy
	33	+
	34	+if __name__ == '__main__':
	35	+ if len(sys.argv)<2:
	36	+ print "Usage: hybrid strategy sample_file"
	37	+ exit(1)
	38	+
	39	+ iterations = 20
	40	+ profile_size = [10,40,70,100,170,240]
	41	+ neighbor_size = [3,10,50,100,200,400]
	42	+
	43	+ #hybrid_strategies = ['knnco','knnco_eset']
	44	+
	45	+ #iterations = 1
	46	+ #profile_size = [10,20,30]
	47	+ #neighbor_size = [10,20,30]
	48	+
	49	+ cfg = Config()
	50	+ population_sample = []
	51	+ strategy = sys.argv[1]
	52	+ sample_file = sys.argv[2]
	53	+ sample_str = sample_file.split('/')[-1]
	54	+ with open(sample_file,'r') as f:
	55	+ for line in f.readlines():
	56	+ user_id = line.strip('\n')
	57	+ population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
	58	+ sample_dir = ("results/hybrid/%s" % sample_str)
	59	+ if not os.path.exists(sample_dir):
	60	+ os.makedirs(sample_dir)
	61	+
	62	+ cfg.strategy = strategy
	63	+ p_20_summary = {}
	64	+ f05_100_summary = {}
	65	+ c_20 = {}
	66	+ c_100 = {}
	67	+
	68	+ log_file = os.path.join(sample_dir,sample_str+"-"+cfg.strategy)
	69	+ graph_20 = {}
	70	+ graph_100 = {}
	71	+ graph_20_jpg = {}
	72	+ graph_100_jpg = {}
	73	+ comment_20 = {}
	74	+ comment_100 = {}
	75	+ for k in neighbor_size:
	76	+ graph_20[k] = log_file+("-neighboorhod%.3d-020.png"%k)
	77	+ graph_100[k] = log_file+("-neighboorhod%.3d-100.png"%k)
	78	+ graph_20_jpg[k] = graph_20[k].strip(".png")+".jpg"
	79	+ graph_100_jpg[k] = graph_100[k].strip(".png")+".jpg"
	80	+ comment_20[k] = graph_20_jpg[k]+".comment"
	81	+ comment_100[k] = graph_100_jpg[k]+".comment"
	82	+
	83	+ with open(comment_20[k],'w') as f:
	84	+ f.write("# %s\n" % sample_str)
	85	+ f.write("# strategy %s\n# threshold 20\n# iterations %d\n\n" %
	86	+ (cfg.strategy,iterations))
	87	+ f.write("# neighboorhood\tprofile\tp_20\tc_20\n\n")
	88	+ with open(comment_100[k],'w') as f:
	89	+ f.write("# %s\n" % sample_str)
	90	+ f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
	91	+ (cfg.strategy,iterations))
	92	+ f.write("# neighboorhood\tprofile\tf05_100\tc_100\n\n")
	93	+
	94	+ c_20[k] = {}
	95	+ c_100[k] = {}
	96	+ p_20_summary[k] = {}
	97	+ f05_100_summary[k] = {}
	98	+ for size in profile_size:
	99	+ c_20[k][size] = set()
	100	+ c_100[k][size] = set()
	101	+ p_20_summary[k][size] = []
	102	+ f05_100_summary[k][size] = []
	103	+ with open(log_file+"-neighboorhood%.3d-profile%.3d"%(k,size),'w') as f:
	104	+ f.write("# %s\n" % sample_str)
	105	+ f.write("# strategy %s-neighboorhood%.3d-profile%.3d\n\n" % (cfg.strategy,k,size))
	106	+ f.write("# p_20\t\tf05_100\n\n")
	107	+
	108	+ # main loop per user
	109	+ for submission_file in population_sample:
	110	+ user = PopconSystem(submission_file)
	111	+ user.filter_pkg_profile(cfg.pkgs_filter)
	112	+ user.maximal_pkg_profile()
	113	+ for k in neighbor_size:
	114	+ cfg.k_neighbors = k
	115	+ for size in profile_size:
	116	+ cfg.profile_size = size
	117	+ rec = Recommender(cfg)
	118	+ repo_size = rec.items_repository.get_doccount()
	119	+ p_20 = []
	120	+ f05_100 = []
	121	+ for n in range(iterations):
	122	+ # Fill sample profile
	123	+ profile_len = len(user.pkg_profile)
	124	+ item_score = {}
	125	+ for pkg in user.pkg_profile:
	126	+ item_score[pkg] = user.item_score[pkg]
	127	+ sample = {}
	128	+ sample_size = int(profile_len*0.9)
	129	+ for i in range(sample_size):
	130	+ key = random.choice(item_score.keys())
	131	+ sample[key] = item_score.pop(key)
	132	+ iteration_user = User(item_score)
	133	+ recommendation = rec.get_recommendation(iteration_user,repo_size)
	134	+ if hasattr(recommendation,"ranking"):
	135	+ ranking = recommendation.ranking
	136	+ real = RecommendationResult(sample)
	137	+ predicted_20 = RecommendationResult(dict.fromkeys(ranking[:20],1))
	138	+ evaluation = Evaluation(predicted_20,real,repo_size)
	139	+ p_20.append(evaluation.run(Precision()))
	140	+ predicted_100 = RecommendationResult(dict.fromkeys(ranking[:100],1))
	141	+ evaluation = Evaluation(predicted_100,real,repo_size)
	142	+ f05_100.append(evaluation.run(F_score(0.5)))
	143	+ c_20[k][size] = c_20[k][size].union(recommendation.ranking[:20])
	144	+ c_100[k][size] = c_100[k][size].union(recommendation.ranking[:100])
	145	+ # save summary
	146	+ if p_20:
	147	+ p_20_summary[k][size].append(sum(p_20)/len(p_20))
	148	+ if f05_100:
	149	+ f05_100_summary[k][size].append(sum(f05_100)/len(f05_100))
	150	+
	151	+ with open(log_file+"-neighboorhood%.3d-profile%.3d"%(k,size),'a') as f:
	152	+ f.write("%.4f\t\t%.4f\n" %
	153	+ ((sum(p_20)/len(p_20),sum(f05_100)/len(f05_100))))
	154	+
	155	+ # back to main flow
	156	+ coverage_20 = {}
	157	+ coverage_100 = {}
	158	+ for k in neighbor_size:
	159	+ coverage_20[k] = {}
	160	+ coverage_100[k] = {}
	161	+ with open(comment_20[k],'a') as f:
	162	+ for size in profile_size:
	163	+ coverage_20[k][size] = len(c_20[k][size])/float(repo_size)
	164	+ f.write("%3d\t\t%3d\t\t%.4f\t%.4f\n" %
	165	+ (k,size,float(sum(p_20_summary[k][size]))/len(p_20_summary[k][size]),coverage_20[k][size]))
	166	+ with open(comment_100[k],'a') as f:
	167	+ for size in profile_size:
	168	+ coverage_100[k][size] = len(c_100[k][size])/float(repo_size)
	169	+ f.write("%3d\t\t%3d\t\t%.4f\t%.4f\n" %
	170	+ (k,size,float(sum(f05_100_summary[k][size]))/len(f05_100_summary[k][size]),coverage_100[k][size]))
	171	+
	172	+ for k in neighbor_size:
	173	+ # plot results summary
	174	+ g = Gnuplot.Gnuplot()
	175	+ g('set style data lines')
	176	+ g('set yrange [0:1.0]')
	177	+ g.xlabel('Profile size')
	178	+ g.title("Setup: %s-neighboorhood%3d (threshold 20)" % (cfg.strategy,k))
	179	+ g.plot(Gnuplot.Data(sorted([[i,sum(p_20_summary[k][i])/len(p_20_summary[k][i])]
	180	+ for i in p_20_summary[k].keys()]),title="Precision"),
	181	+ Gnuplot.Data(sorted([[i,coverage_20[k][i]]
	182	+ for i in coverage_20[k].keys()]),title="Coverage"))
	183	+ g.hardcopy(graph_20[k],terminal="png")
	184	+ #commands.getoutput("convert -quality 100 %s %s" %
	185	+ # (graph_20[k],graph_20_jpg[k]))
	186	+ g = Gnuplot.Gnuplot()
	187	+ g('set style data lines')
	188	+ g('set yrange [0:1.0]')
	189	+ g.xlabel('Profile size')
	190	+ g.title("Setup: %s-neighboorhood%3d (threshold 100)" % (cfg.strategy,k))
	191	+ g.plot(Gnuplot.Data(sorted([[i,sum(f05_100_summary[k][i])/len(f05_100_summary[k][i])]
	192	+ for i in f05_100_summary[k].keys()]),title="F05"),
	193	+ Gnuplot.Data(sorted([[i,coverage_100[k][i]]
	194	+ for i in coverage_100[k].keys()]),title="Coverage"))
	195	+ g.hardcopy(graph_100[k],terminal="png")
	196	+ #commands.getoutput("convert -quality 100 %s %s" %
	197	+ # (graph_100[k],graph_100_jpg[k]))
@@ -0,0 +1,199 @@		@@ -0,0 +1,199 @@
	1	+#!/usr/bin/env python
	2	+"""
	3	+ profile-suite - experiment different profile sizes
	4	+"""
	5	+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
	6	+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
	7	+__license__ = """
	8	+ This program is free software: you can redistribute it and/or modify
	9	+ it under the terms of the GNU General Public License as published by
	10	+ the Free Software Foundation, either version 3 of the License, or
	11	+ (at your option) any later version.
	12	+
	13	+ This program is distributed in the hope that it will be useful,
	14	+ but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	16	+ GNU General Public License for more details.
	17	+
	18	+ You should have received a copy of the GNU General Public License
	19	+ along with this program. If not, see <http://www.gnu.org/licenses/>.
	20	+"""
	21	+
	22	+import sys
	23	+sys.path.insert(0,'../')
	24	+from config import Config
	25	+from data import PopconXapianIndex, PopconSubmission
	26	+from recommender import Recommender
	27	+from user import LocalSystem, User
	28	+from evaluation import *
	29	+import logging
	30	+import random
	31	+import Gnuplot
	32	+import numpy
	33	+
	34	+if __name__ == '__main__':
	35	+ if len(sys.argv)<2:
	36	+ print "Usage: profile-suite strategy_category sample_file"
	37	+ exit(1)
	38	+
	39	+ iterations = 20
	40	+ profile_size = [10,20,40,70,100,140,170,200,240]
	41	+ neighbor_size = [3,5,10,50,100,150,200,300,400,500]
	42	+
	43	+ content_strategies = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
	44	+ collaborative_strategies = ['knn_eset']#,'knn_eset','knn_plus']
	45	+ #collaborative_strategies = ['knn','knn_eset','knn_plus']
	46	+
	47	+ #iterations = 1
	48	+ #profile_size = [10,20,30]
	49	+ #neighbor_size = [10,20,30]
	50	+ #content_strategies = ['cb']
	51	+ #collaborative_strategies = ['knn_eset']
	52	+
	53	+ strategy_category = sys.argv[1]
	54	+ if strategy_category == "content":
	55	+ strategies = content_strategies
	56	+ sizes = profile_size
	57	+ option_str = "profile"
	58	+ elif strategy_category == "collaborative":
	59	+ strategies = collaborative_strategies
	60	+ sizes = neighbor_size
	61	+ option_str = "neighborhood"
	62	+ else:
	63	+ print "Usage: profile-suite strategy_category sample_file"
	64	+ exit(1)
	65	+
	66	+ cfg = Config()
	67	+ population_sample = []
	68	+ sample_file = sys.argv[2]
	69	+ sample_str = sample_file.split('/')[-1]
	70	+ with open(sample_file,'r') as f:
	71	+ for line in f.readlines():
	72	+ user_id = line.strip('\n')
	73	+ population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
	74	+ sample_dir = ("results/%s/%s" %
	75	+ (strategy_category,sample_str))
	76	+ if not os.path.exists(sample_dir):
	77	+ os.makedirs(sample_dir)
	78	+
	79	+ for strategy in strategies:
	80	+ cfg.strategy = strategy
	81	+ p_20_summary = {}
	82	+ f05_100_summary = {}
	83	+ c_20 = {}
	84	+ c_100 = {}
	85	+
	86	+ log_file = os.path.join(sample_dir,sample_str+"-"+cfg.strategy)
	87	+ graph_20 = log_file+"-20.png"
	88	+ graph_100 = log_file+"-100.png"
	89	+ graph_20_jpg = graph_20.strip(".png")+".jpg"
	90	+ graph_100_jpg = graph_100.strip(".png")+".jpg"
	91	+ comment_20 = graph_20_jpg+".comment"
	92	+ comment_100 = graph_100_jpg+".comment"
	93	+
	94	+ with open(comment_20,'w') as f:
	95	+ f.write("# sample %s\n" % sample_str)
	96	+ f.write("# strategy %s\n# threshold 20\n# iterations %d\n\n" %
	97	+ (cfg.strategy,iterations))
	98	+ f.write("# %s\tp_20\tc_20\n\n"%option_str)
	99	+ with open(comment_100,'w') as f:
	100	+ f.write("# sample %s\n" % sample_str)
	101	+ f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
	102	+ (cfg.strategy,iterations))
	103	+ f.write("# %s\t\tf05_100\t\tc_100\n\n"%option_str)
	104	+
	105	+ for size in sizes:
	106	+ c_20[size] = set()
	107	+ c_100[size] = set()
	108	+ p_20_summary[size] = []
	109	+ f05_100_summary[size] = []
	110	+ with open(log_file+"-%s%.3d"%(option_str,size),'w') as f:
	111	+ f.write("# sample %s\n" % sample_str)
	112	+ f.write("# strategy %s-%s%.3d\n\n" % (cfg.strategy,option_str,size))
	113	+ f.write("# p_20\tf05_100\n\n")
	114	+
	115	+ # main loop per user
	116	+ for submission_file in population_sample:
	117	+ user = PopconSystem(submission_file)
	118	+ user.filter_pkg_profile(cfg.pkgs_filter)
	119	+ user.maximal_pkg_profile()
	120	+ for size in sizes:
	121	+ cfg.profile_size = size
	122	+ cfg.k_neighbors = size
	123	+ rec = Recommender(cfg)
	124	+ repo_size = rec.items_repository.get_doccount()
	125	+ p_20 = []
	126	+ f05_100 = []
	127	+ for n in range(iterations):
	128	+ # Fill sample profile
	129	+ profile_len = len(user.pkg_profile)
	130	+ item_score = {}
	131	+ for pkg in user.pkg_profile:
	132	+ item_score[pkg] = user.item_score[pkg]
	133	+ sample = {}
	134	+ sample_size = int(profile_len*0.9)
	135	+ for i in range(sample_size):
	136	+ key = random.choice(item_score.keys())
	137	+ sample[key] = item_score.pop(key)
	138	+ iteration_user = User(item_score)
	139	+ recommendation = rec.get_recommendation(iteration_user,repo_size)
	140	+ if hasattr(recommendation,"ranking"):
	141	+ ranking = recommendation.ranking
	142	+ real = RecommendationResult(sample)
	143	+ predicted_20 = RecommendationResult(dict.fromkeys(ranking[:20],1))
	144	+ evaluation = Evaluation(predicted_20,real,repo_size)
	145	+ p_20.append(evaluation.run(Precision()))
	146	+ predicted_100 = RecommendationResult(dict.fromkeys(ranking[:100],1))
	147	+ evaluation = Evaluation(predicted_100,real,repo_size)
	148	+ f05_100.append(evaluation.run(F_score(0.5)))
	149	+ c_20[size] = c_20[size].union(recommendation.ranking[:20])
	150	+ c_100[size] = c_100[size].union(recommendation.ranking[:100])
	151	+ # save summary
	152	+ if p_20:
	153	+ p_20_summary[size].append(sum(p_20)/len(p_20))
	154	+ if f05_100:
	155	+ f05_100_summary[size].append(sum(f05_100)/len(f05_100))
	156	+
	157	+ with open(log_file+"-%s%.3d"%(option_str,size),'a') as f:
	158	+ f.write("%.4f \t%.4f\n" %
	159	+ ((sum(p_20)/len(p_20),sum(f05_100)/len(f05_100))))
	160	+
	161	+ # back to main flow
	162	+ coverage_20 = {}
	163	+ coverage_100 = {}
	164	+ with open(comment_20,'a') as f:
	165	+ for size in sizes:
	166	+ coverage_20[size] = len(c_20[size])/float(repo_size)
	167	+ f.write("%3d\t\t%.4f\t\t%.4f\n" %
	168	+ (size,float(sum(p_20_summary[size]))/len(p_20_summary[size]),coverage_20[size]))
	169	+ with open(comment_100,'a') as f:
	170	+ for size in sizes:
	171	+ coverage_100[size] = len(c_100[size])/float(repo_size)
	172	+ f.write("%3d\t\t%.4f\t\t%.4f\n" %
	173	+ (size,float(sum(f05_100_summary[size]))/len(f05_100_summary[size]),coverage_100[size]))
	174	+
	175	+ # plot results summary
	176	+ g = Gnuplot.Gnuplot()
	177	+ g('set style data lines')
	178	+ g('set yrange [0:1.0]')
	179	+ g.xlabel('%s size'%option_str.capitalize())
	180	+ g.title("Setup: %s (threshold 20)" % cfg.strategy)
	181	+ g.plot(Gnuplot.Data(sorted([[k,sum(p_20_summary[k])/len(p_20_summary[k])]
	182	+ for k in p_20_summary.keys()]),title="Precision"),
	183	+ Gnuplot.Data(sorted([[k,coverage_20[k]]
	184	+ for k in coverage_20.keys()]),title="Coverage"))
	185	+ g.hardcopy(graph_20,terminal="png")
	186	+ commands.getoutput("convert -quality 20 %s %s" %
	187	+ (graph_100,graph_20_jpg))
	188	+ g = Gnuplot.Gnuplot()
	189	+ g('set style data lines')
	190	+ g('set yrange [0:1.0]')
	191	+ g.xlabel('%s size'%option_str.capitalize())
	192	+ g.title("Setup: %s (threshold 100)" % cfg.strategy)
	193	+ g.plot(Gnuplot.Data(sorted([[k,sum(f05_100_summary[k])/len(f05_100_summary[k])]
	194	+ for k in f05_100_summary.keys()]),title="F05"),
	195	+ Gnuplot.Data(sorted([[k,coverage_100[k]]
	196	+ for k in coverage_100.keys()]),title="Coverage"))
	197	+ g.hardcopy(graph_100,terminal="png")
	198	+ commands.getoutput("convert -quality 100 %s %s" %
	199	+ (graph_100,graph_100_jpg))
@@ -0,0 +1,44 @@		@@ -0,0 +1,44 @@
	1	+#! /usr/bin/env python
	2	+"""
	3	+ sample-popcon-arch - extract a sample of a specific arch
	4	+"""
	5	+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
	6	+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
	7	+__license__ = """
	8	+ This program is free software: you can redistribute it and/or modify
	9	+ it under the terms of the GNU General Public License as published by
	10	+ the Free Software Foundation, either version 3 of the License, or
	11	+ (at your option) any later version.
	12	+
	13	+ This program is distributed in the hope that it will be useful,
	14	+ but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	16	+ GNU General Public License for more details.
	17	+
	18	+ You should have received a copy of the GNU General Public License
	19	+ along with this program. If not, see <http://www.gnu.org/licenses/>.
	20	+"""
	21	+import sys
	22	+sys.path.insert(0,'../')
	23	+import xapian
	24	+import os
	25	+import random
	26	+import sys
	27	+from user import RandomPopcon
	28	+
	29	+if __name__ == '__main__':
	30	+ try:
	31	+ size = int(sys.argv[1])
	32	+ arch = sys.argv[2]
	33	+ popcon_dir = sys.argv[3]
	34	+ pkgs_filter = sys.argv[4]
	35	+ except:
	36	+ print "Usage: sample-popcon-arch size arch popcon_dir pkgs_filter"
	37	+ exit(1)
	38	+
	39	+ sample_file = ("results/misc-popcon/sample-%s-%d" % (arch,size))
	40	+ with open(sample_file,'w') as f:
	41	+ for n in range(1,size+1):
	42	+ user = RandomPopcon(popcon_dir,arch,pkgs_filter)
	43	+ f.write(user.user_id+'\n')
	44	+ print "sample",n