Commit 94efb102510a4e1f84d73385e35ce801856749d9

Authored by Tássia Camões Araújo
2 parents ef8c9733 b33c0cb1
Exists in master and in 1 other branch add_vagrant

Merge branch 'master' of https://github.com/tassia/AppRecommender

@@ -6,10 +6,7 @@ Install dependencies @@ -6,10 +6,7 @@ Install dependencies
6 6
7 # apt-get install \ 7 # apt-get install \
8 python python-xapian python-apt python-cluster python-webpy python-simplejson \ 8 python python-xapian python-apt python-cluster python-webpy python-simplejson \
9 -python-unittest2 python-numpy python-gnuplot \  
10 -apt-xapian-index gnuplot  
11 -  
12 -# cd ./src; git clone https://github.com/rueckstiess/expsuite 9 +python-numpy apt-xapian-index app-install-data python-xdg
13 10
14 11
15 Run AppRecommender web UI 12 Run AppRecommender web UI
@@ -20,4 +17,5 @@ Run AppRecommender web UI @@ -20,4 +17,5 @@ Run AppRecommender web UI
20 17
21 Open a browser and access http://localhost:8080 18 Open a browser and access http://localhost:8080
22 19
  20 +
23 More info at https://github.com/tassia/AppRecommender/wiki 21 More info at https://github.com/tassia/AppRecommender/wiki
src/bin/cross_validation.py
@@ -37,7 +37,7 @@ if __name__ == '__main__': @@ -37,7 +37,7 @@ if __name__ == '__main__':
37 #user = LocalSystem() 37 #user = LocalSystem()
38 #user = RandomPopcon(cfg.popcon_dir) 38 #user = RandomPopcon(cfg.popcon_dir)
39 #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps")) 39 #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
40 - user = PopconSystem("/home/tassia/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623") 40 + user = PopconSystem(os.path.expanduser("~/.app-recommender/popcon-entries/00/0001166d0737c6dffb083071e5ee69f5"))
41 user.filter_pkg_profile(os.path.join(cfg.filters_dir,"desktopapps")) 41 user.filter_pkg_profile(os.path.join(cfg.filters_dir,"desktopapps"))
42 user.maximal_pkg_profile() 42 user.maximal_pkg_profile()
43 begin_time = datetime.datetime.now() 43 begin_time = datetime.datetime.now()
@@ -48,7 +48,7 @@ if __name__ == '__main__': @@ -48,7 +48,7 @@ if __name__ == '__main__':
48 metrics.append(F_score(0.5)) 48 metrics.append(F_score(0.5))
49 metrics.append(Accuracy()) 49 metrics.append(Accuracy())
50 metrics.append(FPR()) 50 metrics.append(FPR())
51 - validation = CrossValidation(0.9,10,rec,metrics,1) 51 + validation = CrossValidation(0.9,20,rec,metrics,0.005)
52 validation.run(user) 52 validation.run(user)
53 print validation 53 print validation
54 54
src/bin/get_axipkgs.py 0 → 100755
@@ -0,0 +1,42 @@ @@ -0,0 +1,42 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + AppRecommender - A GNU/Linux application recommender
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import os
  23 +import sys
  24 +sys.path.insert(0,'../')
  25 +import xapian
  26 +
def package_name_from_terms(terms):
    """
    Return the package name carried by the first "XP"-prefixed term.

    `terms` is an iterable of term strings. Only the two-character prefix is
    removed (str.lstrip('XP') would also eat any further leading "X" or "P"
    characters of the name itself). Returns None when no term starts with
    "XP".
    """
    for term in terms:
        if term.startswith("XP"):
            return term[len("XP"):]
    return None


def iter_package_names(axi):
    """
    Yield the package name of each document stored in the index `axi`.

    Document ids are visited from 1 up to and including
    axi.get_lastdocid(). Ids that no longer hold a document and documents
    without an "XP" term are skipped.
    """
    for docid in range(1, axi.get_lastdocid() + 1):
        try:
            doc = axi.get_document(docid)
        except xapian.DocNotFoundError:
            # Deleted documents leave gaps in the docid sequence.
            continue
        name = package_name_from_terms(t.term for t in doc.termlist())
        if name is not None:
            yield name


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: get_axipkgs index_path")
        sys.exit(1)

    for name in iter_package_names(xapian.Database(sys.argv[1])):
        print(name)
src/bin/get_desktop.sh
1 #!/usr/bin/env bash 1 #!/usr/bin/env bash
2 # 2 #
3 -# get_desktop.sh - get packages which have desktop files 3 +# get_desktop.sh - get packages which have desktop files
  4 +#
  5 +# DEPRECATED: use get_axipkgs.py to get this info from axi
4 6
# Stop if the app-install data directory is missing: otherwise the glob
# below would be expanded in whatever directory the script was started from.
cd /usr/share/app-install/desktop || exit 1
# Print the value of every X-AppInstall-Package entry once, leaving out the
# kdelibs and libfm-gtk0 matches.
sed -ne 's/X-AppInstall-Package=//p' * | sort -u | grep -v kdelibs | grep -v libfm-gtk0
src/bin/get_pkgs_inst.py
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # 2 #
3 # get_pkgs_inst.py - get tuple (package,installation) from popcon results file 3 # get_pkgs_inst.py - get tuple (package,installation) from popcon results file
  4 +#
  5 +# results_file: org/popcon.debian.org/popcon-mail/results
4 6
  7 +import sys
5 from operator import itemgetter 8 from operator import itemgetter
  9 +
def installations_by_package(lines):
    """
    Map each package found in popcon result lines to its installation count.

    Only lines starting with "Package" are considered. The second
    whitespace-separated field is the package name and the count is the sum
    of the three integer fields that follow it. Lines with fewer than five
    fields or with non-integer counts are reported on stderr and skipped
    instead of aborting the whole run.
    """
    pkgs_inst = {}
    for line in lines:
        if not line.startswith("Package"):
            continue
        fields = line.split()
        if len(fields) < 5:
            sys.stderr.write("Skipping malformed line: %s\n" % line.rstrip())
            continue
        try:
            inst = int(fields[2]) + int(fields[3]) + int(fields[4])
        except ValueError:
            sys.stderr.write("Skipping malformed line: %s\n" % line.rstrip())
            continue
        pkgs_inst[fields[1]] = inst
    return pkgs_inst


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: get_pkgs_inst popcon_results_path")
        sys.exit(1)

    results_path = sys.argv[1]
    with open(results_path) as results:
        pkgs_inst = installations_by_package(results)
    # Least installed packages first.
    for pkg, inst in sorted(pkgs_inst.items(), key=itemgetter(1)):
        print("%s %d" % (pkg, inst))
@@ -40,7 +40,7 @@ class Config(Singleton): @@ -40,7 +40,7 @@ class Config(Singleton):
40 ## general options 40 ## general options
41 self.debug = 0 41 self.debug = 0
42 self.verbose = 1 42 self.verbose = 1
43 - self.output = "log" 43 + self.output = "apprec.log"
44 44
45 ## data_source options 45 ## data_source options
46 self.base_dir = os.path.expanduser("/home/tiago/.app-recommender/") 46 self.base_dir = os.path.expanduser("/home/tiago/.app-recommender/")
@@ -103,13 +103,14 @@ class Config(Singleton): @@ -103,13 +103,14 @@ class Config(Singleton):
103 print " -f, --filtersdir=PATH Path to filters directory" 103 print " -f, --filtersdir=PATH Path to filters directory"
104 print " -b, --pkgsfilter=FILTER File containing packages to be considered for recommendations" 104 print " -b, --pkgsfilter=FILTER File containing packages to be considered for recommendations"
105 print " -a, --axi=PATH Path to apt-xapian-index" 105 print " -a, --axi=PATH Path to apt-xapian-index"
106 - print " -e, --dde=URL DDE url"  
107 print " -p, --popconindex=PATH Path to popcon index" 106 print " -p, --popconindex=PATH Path to popcon index"
108 - print " -m, --popcondir=PATH Path to popcon submissions dir"  
109 - print " -u, --indexmode=MODE 'old'|'reindex'|'cluster'|'recluster'"  
110 - print " -l, --clustersdir=PATH Path to popcon clusters dir"  
111 - print " -c, --medoids=k Number of medoids for clustering"  
112 - print " -x, --maxpopcon=k Number of submissions to be considered" 107 + print " -e, --dde=URL DDE url"
  108 + # deprecated options
  109 + #print " -m, --popcondir=PATH Path to popcon submissions dir"
  110 + #print " -u, --indexmode=MODE 'old'|'reindex'|'cluster'|'recluster'"
  111 + #print " -l, --clustersdir=PATH Path to popcon clusters dir"
  112 + #print " -c, --medoids=k Number of medoids for clustering"
  113 + #print " -x, --maxpopcon=k Number of submissions to be considered"
113 print "" 114 print ""
114 print " [ recommender ]" 115 print " [ recommender ]"
115 print " -w, --weight=OPTION Search weighting scheme" 116 print " -w, --weight=OPTION Search weighting scheme"
@@ -123,11 +124,19 @@ class Config(Singleton): @@ -123,11 +124,19 @@ class Config(Singleton):
123 print " bm25 = bm25 weighting scheme" 124 print " bm25 = bm25 weighting scheme"
124 print "" 125 print ""
125 print " [ strategy options ] " 126 print " [ strategy options ] "
126 - print " cb = content-based "  
127 - print " cbt = content-based using only tags as content "  
128 - print " cbd = content-based using only package descriptions as content "  
129 - print " col = collaborative "  
130 - print " colct = collaborative through tags content " 127 + print " cb = content-based, mixed profile"
  128 + print " cbt = content-based, tags only profile"
  129 + print " cbd = content-based, description terms only profile"
  130 + print " cbh = content-based, half-half profile"
  131 + print " cb_eset = cb with eset profiling"
  132 + print " cbt_eset = cbt with eset profiling"
  133 + print " cbd_eset = cbd_eset with eset profiling"
  134 + print " cbh_eset = cbh with eset profiling"
  135 + print " knn = collaborative, tf-idf knn"
  136 + print " knn_plus = collaborative, tf-idf weighted knn"
  137 + print " knn_eset = collaborative, eset knn"
  138 + print " knnco = collaborative through content"
  139 + print " knnco_eset = collaborative through content, eset recommendation"
131 140
132 def read_option(self, section, option): 141 def read_option(self, section, option):
133 """ 142 """
src/evaluation.py
@@ -140,6 +140,29 @@ class FPR(Metric): @@ -140,6 +140,29 @@ class FPR(Metric):
140 return (float(len(evaluation.false_positive))/ 140 return (float(len(evaluation.false_positive))/
141 evaluation.real_negative_len) 141 evaluation.real_negative_len)
142 142
class MCC(Metric):
    """
    Matthews correlation coefficient.
    """
    def __init__(self):
        """
        Set metric description.
        """
        self.desc = " MCC "

    def run(self,evaluation):
        """
        Compute metric.

        Returns 0 when any of the four marginal totals of the confusion
        matrix is zero; otherwise (TP*TN - FP*FN) divided by the square root
        of the product of those totals.
        """
        true_pos = len(evaluation.true_positive)
        false_pos = len(evaluation.false_positive)
        false_neg = len(evaluation.false_negative)
        true_neg = evaluation.true_negative_len
        marginals = (true_pos+false_pos, true_pos+false_neg,
                     true_neg+false_pos, true_neg+false_neg)
        if 0 in marginals:
            return 0
        product = 1
        for total in marginals:
            product = product*total
        covariance = (true_pos*true_neg)-(false_pos*false_neg)
        return covariance/math.sqrt(product)
  165 +
143 class F_score(Metric): 166 class F_score(Metric):
144 """ 167 """
145 Classification accuracy metric which correlates precision and recall into an 168 Classification accuracy metric which correlates precision and recall into an
src/experiments/README
1 -Experiments handled by expsuite:  
2 -https://github.com/rueckstiess/expsuite 1 +AppRecommender experiments and tests
  2 +---------------------------------------
  3 +
  4 +Install dependencies:
  5 +
  6 +# apt-get install \
  7 +python-unittest2 python-gnuplot gnuplot
  8 +
  9 +# cd ./src; git clone https://github.com/rueckstiess/expsuite (deprecated tests)
src/experiments/deprecated/k-suite.py 0 → 100755
@@ -0,0 +1,186 @@ @@ -0,0 +1,186 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + k-suite - experiment different neighborhood sizes
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import sys
  23 +sys.path.insert(0,'../')
  24 +from config import Config
  25 +from data import PopconXapianIndex, PopconSubmission
  26 +from recommender import Recommender
  27 +from user import LocalSystem, User
  28 +from evaluation import *
  29 +import logging
  30 +import random
  31 +import Gnuplot
  32 +import numpy
  33 +
def plot_roc(k,roc_points,log_file):
    """
    Plot roc_points for neighborhood size k on the unit square, together
    with the diagonal from (0,0) to (1,1), and save the figure as
    "<log_file>-kNNN.png" and "<log_file>-kNNN.ps".
    """
    figure = Gnuplot.Gnuplot()
    for command in ('set style data points',):
        figure(command)
    figure.xlabel('False Positive Rate')
    figure.ylabel('True Positive Rate')
    for axis_range in ('set xrange [0:1.0]', 'set yrange [0:1.0]'):
        figure(axis_range)
    setup_name = log_file.split("/")[-1]
    figure.title("Setup: %s-k%d" % (setup_name,k))
    diagonal = Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7")
    measured = Gnuplot.Data(roc_points)
    figure.plot(diagonal,measured)
    png_name = log_file+("-k%.3d.png"%k)
    ps_name = log_file+("-k%.3d.ps"%k)
    figure.hardcopy(png_name,terminal="png")
    figure.hardcopy(ps_name,terminal="postscript",enhanced=1,color=1)
  46 +
def mean_by_key(metric):
    """
    Return [[key, mean], ...] for a dict of value lists, sorted by key.

    Sorting matters because the points are drawn with lines: dictionary
    order would connect the neighborhood sizes in arbitrary order. Keys
    whose list is empty are left out instead of dividing by zero.
    """
    return [[key, sum(values)/float(len(values))]
            for key, values in sorted(metric.items()) if values]


def plot_summary(precision,f05,mcc,log_file):
    """
    Plot the mean precision, F0.5 and MCC per neighborhood size.

    Each argument maps a neighborhood size k to the list of values measured
    for it. The figure is saved as "<log_file>.png" and "<log_file>.ps".
    Nothing is plotted when no metric holds any value.
    """
    series = [(mean_by_key(precision),"P"),
              (mean_by_key(f05),"F05"),
              (mean_by_key(mcc),"MCC")]
    datasets = [Gnuplot.Data(points,title=title)
                for points,title in series if points]
    if not datasets:
        return
    g = Gnuplot.Gnuplot()
    g('set style data lines')
    g.xlabel('Neighborhood (k)')
    g.title("Setup: %s-size20" % (log_file.split("/")[-1]))
    g.plot(*datasets)
    g.hardcopy(log_file+(".png"),terminal="png")
    g.hardcopy(log_file+(".ps"),terminal="postscript",enhanced=1,color=1)
  57 +
class ExperimentResults:
    """
    Metric values gathered over the repeated runs of one experiment setup.
    """
    def __init__(self,repo_size):
        """
        Start with empty result lists for a repository of repo_size items.
        """
        self.repository_size = repo_size
        for metric_name in ("precision","recall","fpr","f05","mcc"):
            setattr(self,metric_name,[])

    def _average(self,values):
        """
        Mean of values, or 0 when the list is empty.
        """
        if values:
            return sum(values)/len(values)
        return 0

    def add_result(self,ranking,sample):
        """
        Evaluate ranking against sample and record one value per metric.
        """
        predicted = RecommendationResult(dict.fromkeys(ranking,1))
        real = RecommendationResult(sample)
        evaluation = Evaluation(predicted,real,self.repository_size)
        recorders = ((self.precision, lambda: Precision()),
                     (self.recall, lambda: Recall()),
                     (self.fpr, lambda: FPR()),
                     (self.f05, lambda: F_score(0.5)),
                     (self.mcc, lambda: MCC()))
        for values,make_metric in recorders:
            values.append(evaluation.run(make_metric()))

    def get_roc_point(self):
        """
        Return [mean fpr, mean recall], or [0,0] if either list is empty.
        """
        if self.recall and self.fpr:
            return [self._average(self.fpr),self._average(self.recall)]
        return [0,0]

    def get_precision_summary(self):
        """
        Mean of the recorded precision values (0 if none).
        """
        return self._average(self.precision)

    def get_f05_summary(self):
        """
        Mean of the recorded F0.5 values (0 if none).
        """
        return self._average(self.f05)

    def get_mcc_summary(self):
        """
        Mean of the recorded MCC values (0 if none).
        """
        return self._average(self.mcc)
  95 +
if __name__ == '__main__':
    # os is used throughout this block but is not imported at module level.
    import os

    # NOTE(review): PopconSystem is not among the names this script imports
    # explicitly -- confirm that "from evaluation import *" provides it, or
    # import it from the module that defines it.

    if len(sys.argv) < 3:
        print("Usage: k-suite strategy_str sample_file")
        sys.exit(1)
    threshold = 20      # size of each recommendation
    iterations = 30     # repetitions per (user, k) pair
    neighbors = [3,5,10,50,100,150,200,300,400,500]
    cfg = Config()
    cfg.strategy = sys.argv[1]
    sample_file = sys.argv[2]
    sample_name = sample_file.split('/')[-1]

    # The sample file holds one submission id per line; a submission is
    # stored under a directory named after the first two characters of
    # its id.
    population_sample = []
    with open(sample_file,'r') as f:
        for line in f:
            user_id = line.strip('\n')
            if not user_id:
                # A blank line would otherwise become a bogus path.
                continue
            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
    if not population_sample:
        print("No submission ids found in %s" % sample_file)
        sys.exit(1)

    # setup dictionaries and files
    roc_summary = {}
    recommended = {}
    precision_summary = {}
    f05_summary = {}
    mcc_summary = {}
    sample_dir = ("results/k-suite/%s" % sample_name)
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)
    log_file = os.path.join(sample_dir,cfg.strategy)
    with open(log_file,'w') as f:
        f.write("# %s\n\n" % sample_name)
        f.write("# strategy %s recommendation_size %d iterations %d\n\n" %
                (cfg.strategy,threshold,iterations))
        f.write("# k coverage \tprecision \tf05 \tmcc\n\n")

    for k in neighbors:
        roc_summary[k] = []
        recommended[k] = set()
        precision_summary[k] = []
        f05_summary[k] = []
        mcc_summary[k] = []
        with open(log_file+"-k%.3d"%k,'w') as f:
            f.write("# %s\n\n" % sample_name)
            f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))
            f.write("# roc_point \tprecision \tf05 \tmcc\n\n")

    # main loop per user
    for submission_file in population_sample:
        user = PopconSystem(submission_file)
        user.filter_pkg_profile(cfg.pkgs_filter)
        user.maximal_pkg_profile()
        for k in neighbors:
            cfg.k_neighbors = k
            rec = Recommender(cfg)
            repo_size = rec.items_repository.get_doccount()
            results = ExperimentResults(repo_size)
            # n iterations for same recommender and user
            for n in range(iterations):
                # Move a random 90% of the profile into `sample`; the
                # recommender only sees the remaining items and is scored
                # on how well it recovers the ones that were moved out.
                profile_len = len(user.pkg_profile)
                item_score = {}
                for pkg in user.pkg_profile:
                    item_score[pkg] = user.item_score[pkg]
                sample = {}
                sample_size = int(profile_len*0.9)
                for i in range(sample_size):
                    key = random.choice(list(item_score))
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,threshold)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
                    recommended[k] = recommended[k].union(recommendation.ranking)
            # save summary
            roc_point = results.get_roc_point()
            roc_summary[k].append(roc_point)
            precision = results.get_precision_summary()
            precision_summary[k].append(precision)
            f05 = results.get_f05_summary()
            f05_summary[k].append(f05)
            mcc = results.get_mcc_summary()
            mcc_summary[k].append(mcc)
            with open(log_file+"-k%.3d"%k,'a') as f:
                f.write("[%.2f,%.2f] \t%.4f \t%.4f \t%.4f\n" %
                        (roc_point[0],roc_point[1],precision,f05,mcc))

    # back to main flow
    plot_summary(precision_summary,f05_summary,mcc_summary,log_file)
    with open(log_file,'a') as f:
        for k in neighbors:
            # Share of the repository recommended at least once with this k
            # (the original indexed `recommended` with an undefined name).
            coverage = len(recommended[k])/float(repo_size)
            f.write("%3d \t%.2f \t%.4f \t%.4f \t%.4f\n" %
                    (k,coverage,float(sum(precision_summary[k]))/len(precision_summary[k]),
                     float(sum(f05_summary[k]))/len(f05_summary[k]),
                     float(sum(mcc_summary[k]))/len(mcc_summary[k])))
            plot_roc(k,roc_summary[k],log_file)
src/experiments/deprecated/strategies-suite.py 0 → 100755
@@ -0,0 +1,274 @@ @@ -0,0 +1,274 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + recommender suite - recommender experiments suite
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import sys
  23 +sys.path.insert(0,'../')
  24 +from config import Config
  25 +from data import PopconXapianIndex, PopconSubmission, AppAptXapianIndex
  26 +from recommender import Recommender
  27 +from user import LocalSystem, User
  28 +from evaluation import *
  29 +import logging
  30 +import random
  31 +import Gnuplot
  32 +
# Reduced parameter set, kept commented out as an alternative to the full
# grid defined below.
#iterations = 3
#sample_proportions = [0.9]
#weighting = [('bm25',1.2)]
#collaborative = ['knn']
#content_based = []
#hybrid = ['knnco']
#profile_size = [50,100]
#popcon_size = ["1000"]
#neighbors = [50]

# Number of repetitions run_strategy performs for each setup.
iterations = 10
# Fractions of the user profile moved into the evaluation sample.
sample_proportions = [0.5, 0.6, 0.7, 0.8, 0.9]
# (weight, bm25_k1) pairs assigned to cfg.weight and cfg.bm25_k1.
weighting = [('bm25',1.2), ('bm25',1.6), ('bm25',2.0), ('trad',0)]
# Strategy names, grouped by the run_* function that iterates over them.
content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
collaborative = ['knn_eset','knn','knn_plus']
hybrid = ['knnco','knnco_eset']

# Values assigned to cfg.profile_size.
profile_size = range(20,100,20)
# popcon_size is only referenced from commented-out code in this script.
#popcon_size = [1000,10000,50000,'full']
# Values assigned to cfg.k_neighbors.
neighbors = range(10,510,50)
  53 +
def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
    """
    Write the recall log of iteration n to the file "<log_file>-<n>".

    The header holds label["description"], label["values"] and a line with
    repo_size, profile_size and len(sample). If recommendation has a
    `ranking` attribute, the header is followed by the sorted positions at
    which sample packages appear in the ranking (first occurrence), then
    the sample packages that are missing from it under the heading
    "Out of recommendation:".

    The file is opened in a with-block so it is closed even when a write
    or a lookup fails.
    """
    with open(("%s-%d" % (log_file,n)),'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%d\n" % (label["values"],n))
        output.write("\n%d %d %d\n" % \
                     (repo_size,profile_size,len(sample)))
        if not hasattr(recommendation,"ranking"):
            return
        # Index the ranking once so each lookup is a dictionary access
        # rather than a scan of the whole list; setdefault keeps the first
        # position of a package, which is what list.index() reports.
        position = {}
        for rank,pkg in enumerate(recommendation.ranking):
            position.setdefault(pkg,rank)
        ranks = []
        notfound = []
        for pkg in sample:
            if pkg in position:
                ranks.append(position[pkg])
            else:
                notfound.append(pkg)
        for r in sorted(ranks):
            output.write(str(r)+"\n")
        if notfound:
            output.write("Out of recommendation:\n")
            for pkg in notfound:
                output.write(pkg+"\n")
  76 +
def plot_summary(precision,recall,f1,f05,accuracy,log_file):
    """
    Plot the five metric summaries against recommendation size and save the
    figure four times: "<log_file>.png"/".ps" with a linear x axis and
    "<log_file>-logscale.png"/".ps" with a logarithmic one.
    """
    chart = Gnuplot.Gnuplot()
    chart('set style data lines')
    chart.xlabel('Recommendation size')
    chart.title("Setup: %s" % log_file.split("/")[-1])
    titled_series = ((accuracy,"Accuracy"),
                     (precision,"Precision"),
                     (recall,"Recall"),
                     (f1,"F_1"),
                     (f05,"F_0.5"))
    curves = [Gnuplot.Data(points,title=name) for points,name in titled_series]
    chart.plot(*curves)

    def save(stem):
        # One bitmap and one postscript copy of the current plot.
        chart.hardcopy(stem+".png",terminal="png")
        chart.hardcopy(stem+".ps",terminal="postscript",enhanced=1,color=1)

    save(log_file)
    chart('set logscale x')
    chart('replot')
    save(log_file+"-logscale")
  94 +
def get_label(cfg,sample_proportion):
    """
    Build the label identifying one experiment setup.

    Returns a dict with two entries: "values", the dash-separated setup
    parameters, and "description", a legend naming those parameters in the
    same order. The fields depend on which of the module-level strategy
    lists contains cfg.strategy. For an unknown strategy a message is
    printed and an empty dict is returned.
    """
    label = {}
    pkgs_filter = cfg.pkgs_filter.split("/")[-1]
    if cfg.strategy in content_based:
        # The legend lists profile before filter, matching the values below.
        label["description"] = "strategy-profile-filter-k1_bm25-sample"
        label["values"] = ("%s-profile%d-%s-kbm%.1f-sample%.1f" %
                           (cfg.strategy,cfg.profile_size,
                            pkgs_filter,
                            cfg.bm25_k1,sample_proportion))
    elif cfg.strategy in collaborative:
        label["description"] = "strategy-knn-filter-k1_bm25-sample"
        label["values"] = ("%s-k%d-%s-kbm%.1f-sample%.1f" %
                           (cfg.strategy,cfg.k_neighbors,
                            pkgs_filter,
                            cfg.bm25_k1,sample_proportion))
    elif cfg.strategy in hybrid:
        # The legend lists profile before filter, matching the values below.
        label["description"] = "strategy-knn-profile-filter-k1_bm25-sample"
        label["values"] = ("%s-k%d-profile%d-%s-kbm%.1f-sample%.1f" %
                           (cfg.strategy,cfg.k_neighbors,cfg.profile_size,
                            pkgs_filter,
                            cfg.bm25_k1,sample_proportion))
    else:
        print("Unknown strategy")
    return label
  118 +
class ExperimentResults:
    """
    Metric values of one experiment setup, grouped by recommendation size.

    Every metric attribute maps a recommendation size to the list of values
    recorded for it. Sizes without recorded values count as 0 in the
    summaries and in the best_* lookups instead of raising.
    """
    def __init__(self,repo_size):
        """
        Create empty value lists for sizes 1, 10..190 (step 10) and
        200..repo_size (step 100, upper bound excluded).
        """
        self.repository_size = repo_size
        self.accuracy = {}
        self.precision = {}
        self.recall = {}
        self.f1 = {}
        self.f05 = {}
        # list() keeps the concatenation valid where range() is lazy.
        points = ([1]+list(range(10,200,10))+
                  list(range(200,self.repository_size,100)))
        for size in points:
            self.accuracy[size] = []
            self.precision[size] = []
            self.recall[size] = []
            self.f1[size] = []
            self.f05[size] = []

    def add_result(self,ranking,sample):
        """
        Evaluate the first `size` items of ranking against sample for every
        recommendation size and record one value per metric.
        """
        for size in self.accuracy.keys():
            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repository_size)
            self.accuracy[size].append(evaluation.run(Accuracy()))
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            self.f1[size].append(evaluation.run(F_score(1)))
            self.f05[size].append(evaluation.run(F_score(0.5)))

    def _summary(self,metric):
        """
        Return [[size, mean], ...] sorted by size; 0 for empty lists.
        """
        summary = []
        for size,values in metric.items():
            mean = sum(values)/len(values) if values else 0
            summary.append([size,mean])
        return sorted(summary)

    def _best(self,metric):
        """
        Return (size, value) for the highest single value in metric.
        """
        def peak(size):
            values = metric[size]
            return max(values) if values else 0
        size = max(metric, key=peak)
        return (size,peak(size))

    def get_precision_summary(self):
        """
        Mean precision per recommendation size.
        """
        return self._summary(self.precision)

    def get_recall_summary(self):
        """
        Mean recall per recommendation size.
        """
        return self._summary(self.recall)

    def get_f1_summary(self):
        """
        Mean F1 per recommendation size.
        """
        return self._summary(self.f1)

    def get_f05_summary(self):
        """
        Mean F0.5 per recommendation size.
        """
        return self._summary(self.f05)

    def get_accuracy_summary(self):
        """
        Mean accuracy per recommendation size.
        """
        return self._summary(self.accuracy)

    def best_precision(self):
        """
        Recommendation size holding the highest precision, with that value.
        """
        return self._best(self.precision)

    def best_f1(self):
        """
        Recommendation size holding the highest F1, with that value.
        """
        return self._best(self.f1)

    def best_f05(self):
        """
        Recommendation size holding the highest F0.5, with that value.
        """
        return self._best(self.f05)
  177 +
def run_strategy(cfg,user):
    """
    Run the experiment for the strategy currently set in cfg.

    For every entry of `weighting` and every entry of `sample_proportions`
    the user profile is split `iterations` times: a random share of it is
    moved into a sample and a recommendation is requested for the remaining
    items. Each iteration writes a recall log; after the last one a summary
    file and the plots are written under results/strategies/.

    A setup in which no iteration produced a ranking has nothing to
    average, so its summary and plots are skipped with a message.
    """
    # os is not imported at module level in this script.
    import os

    results_dir = "results/strategies"
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    for weight in weighting:
        cfg.weight = weight[0]
        cfg.bm25_k1 = weight[1]
        rec = Recommender(cfg)
        repo_size = rec.items_repository.get_doccount()
        for proportion in sample_proportions:
            results = ExperimentResults(repo_size)
            label = get_label(cfg,proportion)
            log_file = results_dir+"/"+label["values"]
            evaluated = 0
            for n in range(iterations):
                # Fill sample profile
                profile_size = len(user.pkg_profile)
                item_score = {}
                for pkg in user.pkg_profile:
                    item_score[pkg] = user.item_score[pkg]
                sample = {}
                sample_size = int(profile_size*proportion)
                for i in range(sample_size):
                    key = random.choice(list(item_score))
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,repo_size)
                write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
                    evaluated += 1
            if not evaluated:
                print("No ranking produced for %s, summary skipped" % label["values"])
                continue
            with open(log_file,'w') as f:
                precision_10 = sum(results.precision[10])/len(results.precision[10])
                f1_10 = sum(results.f1[10])/len(results.f1[10])
                f05_10 = sum(results.f05[10])/len(results.f05[10])
                # "coverage" is the size of the last recommendation obtained.
                f.write("# %s\n# %s\n\ncoverage %d\n\n" %
                        (label["description"],label["values"],recommendation.size))
                f.write("# best results (recommendation size; metric)\n")
                f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\nf05 (%d; %.2f)\n\n" %
                        (results.best_precision()[0],results.best_precision()[1],
                         results.best_f1()[0],results.best_f1()[1],
                         results.best_f05()[0],results.best_f05()[1]))
                f.write("# recommendation size 10\nprecision (10; %.2f)\nf1 (10; %.2f)\nf05 (10; %.2f)" %
                        (precision_10,f1_10,f05_10))
            precision = results.get_precision_summary()
            recall = results.get_recall_summary()
            f1 = results.get_f1_summary()
            f05 = results.get_f05_summary()
            accuracy = results.get_accuracy_summary()
            plot_summary(precision,recall,f1,f05,accuracy,log_file)
  223 +
def run_content(user,cfg):
    """
    Run run_strategy for every content-based strategy combined with every
    profile size.
    """
    for strategy_name in content_based:
        cfg.strategy = strategy_name
        for terms_in_profile in profile_size:
            cfg.profile_size = terms_in_profile
            run_strategy(cfg,user)
  230 +
def run_collaborative(user,cfg):
    """
    Run run_strategy for every collaborative strategy combined with every
    neighborhood size.
    """
    # Only the disabled popcon_size variation below uses these two values.
    popcon_desktopapps,popcon_programs = (cfg.popcon_desktopapps,
                                          cfg.popcon_programs)
    for strategy_name in collaborative:
        cfg.strategy = strategy_name
        for neighborhood in neighbors:
            cfg.k_neighbors = neighborhood
            #for size in popcon_size:
            #    if size:
            #        cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
            #        cfg.popcon_programs = popcon_programs+"_"+size
            run_strategy(cfg,user)
  243 +
def run_hybrid(user,cfg):
    """
    Run run_strategy for every hybrid strategy combined with every
    neighborhood size and every profile size.
    """
    # Only the disabled popcon_size variation below uses these two values.
    popcon_desktopapps,popcon_programs = (cfg.popcon_desktopapps,
                                          cfg.popcon_programs)
    for strategy_name in hybrid:
        cfg.strategy = strategy_name
        for neighborhood in neighbors:
            cfg.k_neighbors = neighborhood
            #for size in popcon_size:
            #    if size:
            #        cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
            #        cfg.popcon_programs = popcon_programs+"_"+size
            for terms_in_profile in profile_size:
                cfg.profile_size = terms_in_profile
                run_strategy(cfg,user)
  258 +
if __name__ == '__main__':
    # os is not imported at module level in this script.
    import os

    # NOTE(review): PopconSystem is not among the names this script imports
    # explicitly -- confirm that "from evaluation import *" provides it, or
    # import it from the module that defines it.

    #user = LocalSystem()
    #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))

    cfg = Config()
    # "~" expands to /root when run as root (the path that used to be
    # hard-coded) and keeps the script usable for any other user.
    user = PopconSystem(os.path.expanduser(
        "~/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7"))
    #user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    # Without arguments every family of strategies is run; otherwise only
    # the families named on the command line.
    run_all = len(sys.argv) < 2
    if run_all or "content" in sys.argv:
        run_content(user,cfg)
    if run_all or "collaborative" in sys.argv:
        run_collaborative(user,cfg)
    if run_all or "hybrid" in sys.argv:
        run_hybrid(user,cfg)
src/experiments/experiments.cfg
@@ -1,27 +0,0 @@ @@ -1,27 +0,0 @@
1 -[DEFAULT]  
2 -repetitions = 1  
3 -iterations = 10  
4 -path = 'results'  
5 -experiment = 'grid'  
6 -weight = ['bm25', 'trad']  
7 -;profile_size = range(10,100,10)  
8 -;sample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]  
9 -sample = [0.6, 0.7, 0.8, 0.9]  
10 -  
11 -[content]  
12 -strategy = ['cb','cbt','cbd']  
13 -  
14 -[clustering]  
15 -experiment = 'single'  
16 -;iterations = 4  
17 -;medoids = range(2,6)  
18 -iterations = 6  
19 -medoids = [100,500,1000,5000,10000,50000]  
20 -;disabled for this experiment  
21 -weight = 0  
22 -profile_size = 0  
23 -sample = 0  
24 -  
25 -[colaborative]  
26 -users_repository=["data/popcon","data/popcon-100","data/popcon-500","data/popcon-1000","data/popcon-5000","data/popcon-10000","data/popcon-50000"]  
27 -neighbors = range(10,1010,50)  
src/experiments/extract-sample-db.py 0 → 100755
@@ -0,0 +1,49 @@ @@ -0,0 +1,49 @@
  1 +#! /usr/bin/env python
  2 +"""
  3 + extract-sample-db - move the popcon submissions listed in a sample file into a new index
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import xapian
  23 +import os
  24 +import random
  25 +import sys
  26 +
def sample_index_path(popcon_index, sample_file):
    """Return the path of the index that will receive the sampled documents.

    The name is the source index path followed by "-" and the base name of
    the sample file.  A trailing "/" on the index path is dropped first so
    the new index is created next to the source index rather than inside it.
    """
    return popcon_index.rstrip("/") + "-" + sample_file.split("/")[-1]


if __name__ == '__main__':
    # Usage: extract-sample-db sample_file popcon_index
    #
    # For every submission id listed (one per line) in sample_file, copy the
    # matching documents from popcon_index into a new index and delete them
    # from popcon_index.
    try:
        sample_file = sys.argv[1]
        popcon = xapian.WritableDatabase(sys.argv[2], xapian.DB_OPEN)
    except (IndexError, xapian.DatabaseError):
        # Missing argument, or the index could not be opened for writing.
        print("Usage: extract-sample-db sample_file popcon_index")
        sys.exit(1)

    enquire = xapian.Enquire(popcon)
    print(sample_file.split("/"))
    new_popcon = xapian.WritableDatabase(sample_index_path(sys.argv[2], sample_file),
                                         xapian.DB_CREATE_OR_OVERWRITE)
    print("Popcon repository size: %d" % popcon.get_doccount())

    with open(sample_file) as submissions:
        for submission in submissions:
            submission_id = submission.strip()
            if not submission_id:
                # A blank line would otherwise issue a query for the bare
                # "ID" prefix.
                continue
            term = "ID" + submission_id
            print(term)
            enquire.set_query(xapian.Query(term))
            # At most 20 matches are fetched per submission id.
            for m in enquire.get_mset(0, 20):
                print("Adding doc %s" % m.docid)
                new_popcon.add_document(popcon.get_document(m.docid))
                print("Removing doc %s" % m.docid)
                popcon.delete_document(m.docid)

    print("Popcon repository size: %d" % popcon.get_doccount())
    print("Popcon repository size: %d" % new_popcon.get_doccount())
src/experiments/hybrid.py 0 → 100755
@@ -0,0 +1,202 @@ @@ -0,0 +1,202 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + hybrid-suite
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import sys
  23 +sys.path.insert(0,'../')
  24 +from config import Config
  25 +from data import PopconXapianIndex, PopconSubmission
  26 +from recommender import Recommender
  27 +from user import LocalSystem, User
  28 +from evaluation import *
  29 +import logging
  30 +import random
  31 +import Gnuplot
  32 +import numpy
  33 +
  34 +#hybrid_strategies = ['knnco','knnco_eset']
  35 +
def png_to_jpg_path(png_path):
    """Return png_path with a trailing ".png" extension replaced by ".jpg".

    Only an exact ".png" suffix is removed.  str.strip(".png") is not
    suitable here: it removes any run of the characters '.', 'p', 'n' and
    'g' from BOTH ends of the string and can therefore damage the path.
    """
    suffix = ".png"
    if png_path.endswith(suffix):
        png_path = png_path[:-len(suffix)]
    return png_path + ".jpg"


def load_population(sample_file, popcon_dir):
    """Return the submission file path of every user id listed in sample_file.

    sample_file holds one user id per line; blank lines are ignored.  Each
    path is built as popcon_dir/<first two characters of the id>/<id>.
    """
    import os
    population = []
    with open(sample_file, 'r') as f:
        for line in f:
            user_id = line.strip()
            if user_id:
                population.append(os.path.join(popcon_dir, user_id[:2], user_id))
    return population


def holdout_split(item_score, fraction=0.9):
    """Randomly split item_score into (kept, held_out) dictionaries.

    int(len(item_score) * fraction) randomly chosen entries are moved to
    held_out; the remaining entries stay in kept.  item_score itself is not
    modified.
    """
    kept = dict(item_score)
    held_out = {}
    for key in random.sample(list(kept), int(len(kept) * fraction)):
        held_out[key] = kept.pop(key)
    return kept, held_out


if __name__ == '__main__':
    # Usage: hybrid strategy sample_file
    #
    # For every user of the sample and every (neighborhood size, profile
    # size) pair, repeatedly hide 90% of the user's profile, ask for a
    # recommendation based on the remaining 10%, and record precision of the
    # top 10 and F0.5 of the top 100 against the hidden part.  Per-setup
    # logs, summary ".comment" files and gnuplot graphs are written under
    # results/hybrid/<sample>/<strategy>/.
    #
    # NOTE(review): `os` and `PopconSystem` are not imported by name in this
    # script; presumably they arrive through `from evaluation import *` --
    # confirm.  `os` is imported here so path handling does not rely on it.
    import os

    # Both the strategy and the sample file are required.
    if len(sys.argv) < 3:
        print("Usage: hybrid strategy sample_file")
        sys.exit(1)

    iterations = 20
    profile_size = [10, 40, 70, 100, 170, 240]
    neighbor_size = [3, 10, 50, 70, 100, 150, 200]

    # Smaller settings for a quick trial run:
    #iterations = 1
    #profile_size = [10,20,30]
    #neighbor_size = [10,20,30]

    cfg = Config()
    strategy = sys.argv[1]
    sample_file = sys.argv[2]
    sample_str = sample_file.split('/')[-1]
    population_sample = load_population(sample_file, cfg.popcon_dir)
    if not population_sample:
        # The repository size used by the coverage summary is only known
        # after at least one user has been evaluated.
        print("No submissions listed in %s" % sample_file)
        sys.exit(1)
    sample_dir = "results/hybrid/%s/%s" % (sample_str, strategy)
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)

    cfg.strategy = strategy
    p_10_summary = {}
    f05_100_summary = {}
    c_10 = {}
    c_100 = {}

    log_file = os.path.join(sample_dir, sample_str + "-" + cfg.strategy)
    graph_10 = {}
    graph_100 = {}
    graph_10_jpg = {}
    graph_100_jpg = {}
    comment_10 = {}
    comment_100 = {}
    # Prepare output file names, write the file headers and create empty
    # accumulators for every (neighborhood, profile) pair.
    for k in neighbor_size:
        graph_10[k] = log_file + ("-neighborhood%.3d-010.png" % k)
        graph_100[k] = log_file + ("-neighborhood%.3d-100.png" % k)
        graph_10_jpg[k] = png_to_jpg_path(graph_10[k])
        graph_100_jpg[k] = png_to_jpg_path(graph_100[k])
        comment_10[k] = graph_10_jpg[k] + ".comment"
        comment_100[k] = graph_100_jpg[k] + ".comment"

        with open(comment_10[k], 'w') as f:
            f.write("# %s\n" % sample_str)
            f.write("# strategy %s\n# threshold 10\n# iterations %d\n\n" %
                    (cfg.strategy, iterations))
            f.write("# neighborhood\tprofile\tmean_p_10\tdev_p_10\tc_10\n\n")
        with open(comment_100[k], 'w') as f:
            f.write("# %s\n" % sample_str)
            f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
                    (cfg.strategy, iterations))
            f.write("# neighborhood\tprofile\tmean_f05_100\tdev_f05_100\tc_100\n\n")

        c_10[k] = {}
        c_100[k] = {}
        p_10_summary[k] = {}
        f05_100_summary[k] = {}
        for size in profile_size:
            c_10[k][size] = set()
            c_100[k][size] = set()
            p_10_summary[k][size] = []
            f05_100_summary[k][size] = []
            with open(log_file + "-neighborhood%.3d-profile%.3d" % (k, size), 'w') as f:
                f.write("# %s\n" % sample_str)
                f.write("# strategy %s-neighborhood%.3d-profile%.3d\n\n" % (cfg.strategy, k, size))
                f.write("# p_10\t\tf05_100\n\n")

    # main loop per user
    for submission_file in population_sample:
        user = PopconSystem(submission_file)
        user.filter_pkg_profile(cfg.pkgs_filter)
        user.maximal_pkg_profile()
        # Scores of the packages in the user's profile; the same for every
        # setup and iteration, so built once per user.
        profile_scores = dict((pkg, user.item_score[pkg]) for pkg in user.pkg_profile)
        for k in neighbor_size:
            cfg.k_neighbors = k
            for size in profile_size:
                cfg.profile_size = size
                rec = Recommender(cfg)
                repo_size = rec.items_repository.get_doccount()
                p_10 = []
                f05_100 = []
                for n in range(iterations):
                    # Hide 90% of the profile; recommend from the rest.
                    item_score, sample = holdout_split(profile_scores, 0.9)
                    iteration_user = User(item_score)
                    recommendation = rec.get_recommendation(iteration_user, repo_size)
                    if hasattr(recommendation, "ranking"):
                        ranking = recommendation.ranking
                        real = RecommendationResult(sample)
                        predicted_10 = RecommendationResult(dict.fromkeys(ranking[:10], 1))
                        evaluation = Evaluation(predicted_10, real, repo_size)
                        p_10.append(evaluation.run(Precision()))
                        predicted_100 = RecommendationResult(dict.fromkeys(ranking[:100], 1))
                        evaluation = Evaluation(predicted_100, real, repo_size)
                        f05_100.append(evaluation.run(F_score(0.5)))
                        # Items ever recommended, used for coverage.
                        c_10[k][size] = c_10[k][size].union(ranking[:10])
                        c_100[k][size] = c_100[k][size].union(ranking[:100])
                # save summary
                if p_10:
                    p_10_summary[k][size].append(numpy.mean(p_10))
                if f05_100:
                    f05_100_summary[k][size].append(numpy.mean(f05_100))

                with open(log_file + "-neighborhood%.3d-profile%.3d" % (k, size), 'a') as f:
                    f.write("%.4f\t\t%.4f\n" %
                            (numpy.mean(p_10), numpy.mean(f05_100)))

    # back to main flow
    coverage_10 = {}
    coverage_100 = {}
    for k in neighbor_size:
        coverage_10[k] = {}
        coverage_100[k] = {}
        with open(comment_10[k], 'a') as f:
            for size in profile_size:
                coverage_10[k][size] = len(c_10[k][size]) / float(repo_size)
                f.write("%3d\t\t%3d\t\t%.4f\t%.4f\t%.4f\n" %
                        (k, size, numpy.mean(p_10_summary[k][size]),
                         numpy.std(p_10_summary[k][size]), coverage_10[k][size]))
        with open(comment_100[k], 'a') as f:
            for size in profile_size:
                coverage_100[k][size] = len(c_100[k][size]) / float(repo_size)
                f.write("%3d\t\t%3d\t\t%.4f\t%.4f\t%.4f\n" %
                        (k, size, numpy.mean(f05_100_summary[k][size]),
                         numpy.std(f05_100_summary[k][size]), coverage_100[k][size]))

    for k in neighbor_size:
        # plot results summary: mean, deviation (error bars) and coverage
        # as a function of the profile size
        precision_points = sorted([i, numpy.mean(p_10_summary[k][i]), numpy.std(p_10_summary[k][i])]
                                  for i in p_10_summary[k])
        coverage_points = sorted([i, coverage_10[k][i]] for i in coverage_10[k])
        g = Gnuplot.Gnuplot()
        g('set style data lines')
        g('set yrange [0:1.0]')
        g.xlabel('Profile size')
        g.title("Setup: %s-neighborhood%3d (threshold 10)" % (cfg.strategy, k))
        g.plot(Gnuplot.Data(precision_points, title="Precision"),
               Gnuplot.Data(precision_points, title="Deviation",
                            with_="yerrorbar lt 2 pt 6"),
               Gnuplot.Data(coverage_points, title="Coverage"))
        g.hardcopy(graph_10[k], terminal="png")

        f05_points = sorted([i, numpy.mean(f05_100_summary[k][i]), numpy.std(f05_100_summary[k][i])]
                            for i in f05_100_summary[k])
        coverage_points = sorted([i, coverage_100[k][i]] for i in coverage_100[k])
        g = Gnuplot.Gnuplot()
        g('set style data lines')
        g('set yrange [0:1.0]')
        g.xlabel('Profile size')
        g.title("Setup: %s-neighborhood%3d (threshold 100)" % (cfg.strategy, k))
        g.plot(Gnuplot.Data(f05_points, title="F05"),
               Gnuplot.Data(f05_points, title="Deviation",
                            with_="yerrorbar lt 2 pt 6"),
               Gnuplot.Data(coverage_points, title="Coverage"))
        g.hardcopy(graph_100[k], terminal="png")
src/experiments/k-suite.py
@@ -1,152 +0,0 @@ @@ -1,152 +0,0 @@
1 -#!/usr/bin/env python  
2 -"""  
3 - recommender suite - recommender experiments suite  
4 -"""  
5 -__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"  
6 -__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"  
7 -__license__ = """  
8 - This program is free software: you can redistribute it and/or modify  
9 - it under the terms of the GNU General Public License as published by  
10 - the Free Software Foundation, either version 3 of the License, or  
11 - (at your option) any later version.  
12 -  
13 - This program is distributed in the hope that it will be useful,  
14 - but WITHOUT ANY WARRANTY; without even the implied warranty of  
15 - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the  
16 - GNU General Public License for more details.  
17 -  
18 - You should have received a copy of the GNU General Public License  
19 - along with this program. If not, see <http://www.gnu.org/licenses/>.  
20 -"""  
21 -  
22 -import sys  
23 -sys.path.insert(0,'../')  
24 -from config import Config  
25 -from data import PopconXapianIndex, PopconSubmission  
26 -from recommender import Recommender  
27 -from user import LocalSystem, User  
28 -from evaluation import *  
29 -import logging  
30 -import random  
31 -import Gnuplot  
32 -import numpy  
33 -  
34 -def plot_roc(p,roc_points,log_file):  
35 - g = Gnuplot.Gnuplot()  
36 - g('set style data points')  
37 - g.xlabel('False Positive Rate')  
38 - g.ylabel('True Positive Rate')  
39 - g('set xrange [0:1.0]')  
40 - g('set yrange [0:1.0]')  
41 - g.title("Setup: %s" % log_file.split("/")[-1])  
42 - g.plot(Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"),  
43 - Gnuplot.Data(roc_points,title="k %d"%k))  
44 - g.hardcopy(log_file+("-k%.3d.png"%k),terminal="png")  
45 - g.hardcopy(log_file+("-k%.3d.ps"%k),terminal="postscript",enhanced=1,color=1)  
46 -  
47 -class ExperimentResults:  
48 - def __init__(self,repo_size):  
49 - self.repository_size = repo_size  
50 - self.precision = []  
51 - self.recall = []  
52 - self.fpr = []  
53 -  
54 - def add_result(self,ranking,sample):  
55 - predicted = RecommendationResult(dict.fromkeys(ranking,1))  
56 - real = RecommendationResult(sample)  
57 - evaluation = Evaluation(predicted,real,self.repository_size)  
58 - self.precision.append(evaluation.run(Precision()))  
59 - self.recall.append(evaluation.run(Recall()))  
60 - self.fpr.append(evaluation.run(FPR()))  
61 -  
62 - # Average ROC by threshold (whici is the size)  
63 - def get_roc_point(self):  
64 - tpr = self.recall  
65 - fpr = self.fpr  
66 - return [sum(fpr)/len(fpr),sum(tpr)/len(tpr)]  
67 -  
68 - def get_precision_summary(self):  
69 - return sum(self.precision)/len(self.precision)  
70 -  
71 - def get_recall_summary(self):  
72 - return sum(self.recall)/len(self.recall)  
73 -  
74 -if __name__ == '__main__':  
75 - # experiment parameters  
76 - threshold = 20  
77 - iterations = 30  
78 - sample_file = "results/misc-popcon/sample-050-100"  
79 - neighbors = [3,5,10,50,100,150,200,300,400,500]  
80 - cfg = Config()  
81 - cfg.strategy = "knn"  
82 - print cfg.popcon_index  
83 - sample = []  
84 - with open(sample_file,'r') as f:  
85 - for line in f.readlines():  
86 - user_id = line.strip('\n')  
87 - sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))  
88 - # setup dictionaries and files  
89 - roc_points = {}  
90 - recommended = {}  
91 - precisions = {}  
92 - aucs = {}  
93 - log_file = "results/k-suite/sample-050-100/%s" % (cfg.strategy)  
94 - for k in neighbors:  
95 - roc_points[k] = []  
96 - recommended[k] = set()  
97 - precisions[k] = []  
98 - aucs[k] = []  
99 - with open(log_file+"-k%.3d"%k,'w') as f:  
100 - f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))  
101 - f.write("# roc_point \tp(20) \tauc\n\n")  
102 - # main loop per user  
103 - for submission_file in sample:  
104 - user = PopconSystem(submission_file)  
105 - user.filter_pkg_profile(cfg.pkgs_filter)  
106 - user.maximal_pkg_profile()  
107 - for k in neighbors:  
108 - cfg.k_neighbors = k  
109 - rec = Recommender(cfg)  
110 - repo_size = rec.items_repository.get_doccount()  
111 - results = ExperimentResults(repo_size)  
112 - # n iterations for same recommender and user  
113 - for n in range(iterations):  
114 - # Fill sample profile  
115 - profile_size = len(user.pkg_profile)  
116 - item_score = {}  
117 - for pkg in user.pkg_profile:  
118 - item_score[pkg] = user.item_score[pkg]  
119 - sample = {}  
120 - sample_size = int(profile_size*0.9)  
121 - for i in range(sample_size):  
122 - key = random.choice(item_score.keys())  
123 - sample[key] = item_score.pop(key)  
124 - iteration_user = User(item_score)  
125 - recommendation = rec.get_recommendation(iteration_user,threshold)  
126 - if hasattr(recommendation,"ranking"):  
127 - results.add_result(recommendation.ranking,sample)  
128 - print "ranking",recommendation.ranking  
129 - print "recommended_%d"%k,recommended[k]  
130 - recommended[k] = recommended[k].union(recommendation.ranking)  
131 - print recommended[k]  
132 - # save summary  
133 - roc_point = results.get_roc_point()  
134 - auc = numpy.trapz(y=[0,roc_point[1],1],x=[0,roc_point[0],1])  
135 - p_20 = results.get_precision_summary()  
136 - roc_points[k].append(roc_point)  
137 - aucs[k].append(auc)  
138 - precisions[k].append(p_20)  
139 - with open(log_file+"-k%.3d"%k,'a') as f:  
140 - f.write("%s \t%.2f \t%.4f\n" % (str(roc_point),p_20,auc))  
141 - # back to main flow  
142 - with open(log_file,'w') as f:  
143 - f.write("# k coverage \tp(20) \tauc\n\n")  
144 - for k in neighbors:  
145 - print "len_recommended_%d"%k,len(recommended[k])  
146 - print "repo_size",repo_size  
147 - coverage = len(recommended[k])/float(repo_size)  
148 - print coverage  
149 - f.write("%d \t%.2f \t%.2f \t%.2fi\n" %  
150 - (k,coverage,float(sum(precisions[k]))/len(precisions[k]),  
151 - float(sum(aucs[k]))/len(aucs[k])))  
152 - plot_roc(k,roc_points[k],log_file)  
src/experiments/legacy/clustering-suite.py
@@ -1,51 +0,0 @@ @@ -1,51 +0,0 @@
1 -#!/usr/bin/env python  
2 -"""  
3 - recommender suite - recommender experiments suite  
4 -"""  
5 -__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"  
6 -__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"  
7 -__license__ = """  
8 - This program is free software: you can redistribute it and/or modify  
9 - it under the terms of the GNU General Public License as published by  
10 - the Free Software Foundation, either version 3 of the License, or  
11 - (at your option) any later version.  
12 -  
13 - This program is distributed in the hope that it will be useful,  
14 - but WITHOUT ANY WARRANTY; without even the implied warranty of  
15 - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the  
16 - GNU General Public License for more details.  
17 -  
18 - You should have received a copy of the GNU General Public License  
19 - along with this program. If not, see <http://www.gnu.org/licenses/>.  
20 -"""  
21 -  
22 -import sys  
23 -import os  
24 -sys.path.insert(0,'../')  
25 -from config import Config  
26 -from data import PopconXapianIndex, PopconSubmission  
27 -from recommender import Recommender  
28 -from user import LocalSystem, User  
29 -from evaluation import *  
30 -import logging  
31 -import random  
32 -import Gnuplot  
33 -  
34 -if __name__ == '__main__':  
35 -  
36 - cfg = Config()  
37 - cfg.index_mode = "recluster"  
38 - logging.info("Starting clustering experiments")  
39 - logging.info("Medoids: %d\t Max popcon:%d" % (cfg.k_medoids,cfg.max_popcon))  
40 - cfg.popcon_dir = os.path.expanduser("~/org/popcon.debian.org/popcon-mail/popcon-entries/")  
41 - cfg.popcon_index = cfg.popcon_index+("_%dmedoids%dmax" %  
42 - (cfg.k_medoids,cfg.max_popcon))  
43 - cfg.clusters_dir = cfg.clusters_dir+("_%dmedoids%dmax" %  
44 - (cfg.k_medoids,cfg.max_popcon))  
45 - pxi = PopconXapianIndex(cfg)  
46 - logging.info("Overall dispersion: %f\n" % pxi.cluster_dispersion)  
47 - # Write clustering log  
48 - output = open(("results/clustering/%dmedoids%dmax" % (cfg.k_medoids,cfg.max_popcon)),'w')  
49 - output.write("# k_medoids\tmax_popcon\tdispersion\n")  
50 - output.write("%d %f\n" % (cfg.k_medoids,cfg.max_popcon,pxi.cluster_dispersion))  
51 - output.close()  
src/experiments/legacy/experiments.cfg
@@ -1,27 +0,0 @@ @@ -1,27 +0,0 @@
1 -[DEFAULT]  
2 -repetitions = 1  
3 -iterations = 10  
4 -path = 'results'  
5 -experiment = 'grid'  
6 -weight = ['bm25', 'trad']  
7 -;profile_size = range(10,100,10)  
8 -;sample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]  
9 -sample = [0.6, 0.7, 0.8, 0.9]  
10 -  
11 -[content]  
12 -strategy = ['cb','cbt','cbd']  
13 -  
14 -[clustering]  
15 -experiment = 'single'  
16 -;iterations = 4  
17 -;medoids = range(2,6)  
18 -iterations = 6  
19 -medoids = [100,500,1000,5000,10000,50000]  
20 -;disabled for this experiment  
21 -weight = 0  
22 -profile_size = 0  
23 -sample = 0  
24 -  
25 -[colaborative]  
26 -users_repository=["data/popcon","data/popcon-100","data/popcon-500","data/popcon-1000","data/popcon-5000","data/popcon-10000","data/popcon-50000"]  
27 -neighbors = range(10,1010,50)  
src/experiments/legacy/runner.py
@@ -1,171 +0,0 @@ @@ -1,171 +0,0 @@
1 -#!/usr/bin/env python  
2 -"""  
3 - recommender suite - recommender experiments suite  
4 -"""  
5 -__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"  
6 -__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"  
7 -__license__ = """  
8 - This program is free software: you can redistribute it and/or modify  
9 - it under the terms of the GNU General Public License as published by  
10 - the Free Software Foundation, either version 3 of the License, or  
11 - (at your option) any later version.  
12 -  
13 - This program is distributed in the hope that it will be useful,  
14 - but WITHOUT ANY WARRANTY; without even the implied warranty of  
15 - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the  
16 - GNU General Public License for more details.  
17 -  
18 - You should have received a copy of the GNU General Public License  
19 - along with this program. If not, see <http://www.gnu.org/licenses/>.  
20 -"""  
21 -  
22 -import expsuite  
23 -import sys  
24 -sys.path.insert(0,'../')  
25 -from config import Config  
26 -from data import PopconXapianIndex, PopconSubmission  
27 -from recommender import Recommender  
28 -from user import LocalSystem, User  
29 -from evaluation import *  
30 -import logging  
31 -import random  
32 -import Gnuplot  
33 -  
34 -class ClusteringSuite(expsuite.PyExperimentSuite):  
35 - def reset(self, params, rep):  
36 - self.cfg = Config()  
37 - self.cfg.popcon_index = "../tests/test_data/.sample_pxi"  
38 - self.cfg.popcon_dir = "../tests/test_data/popcon_dir"  
39 - self.cfg.clusters_dir = "../tests/test_data/clusters_dir"  
40 -  
41 - if params['name'] == "clustering":  
42 - logging.info("Starting 'clustering' experiments suite...")  
43 - self.cfg.index_mode = "recluster"  
44 -  
45 - def iterate(self, params, rep, n):  
46 - if params['name'] == "clustering":  
47 - logging.info("Running iteration %d" % params['medoids'][n])  
48 - self.cfg.k_medoids = params['medoids'][n]  
49 - pxi = PopconXapianIndex(self.cfg)  
50 - result = {'k_medoids': params['medoids'][n],  
51 - 'dispersion': pxi.cluster_dispersion}  
52 - else:  
53 - result = {}  
54 - return result  
55 -  
56 -class ContentBasedSuite(expsuite.PyExperimentSuite):  
57 - def reset(self, params, rep):  
58 - if params['name'].startswith("content"):  
59 - cfg = Config()  
60 - #if the index was not built yet  
61 - #app_axi = AppAptXapianIndex(cfg.axi,"results/arnaldo/AppAxi")  
62 - cfg.axi = "data/AppAxi"  
63 - cfg.index_mode = "old"  
64 - cfg.weight = params['weight']  
65 - self.rec = Recommender(cfg)  
66 - self.rec.set_strategy(params['strategy'])  
67 - self.repo_size = self.rec.items_repository.get_doccount()  
68 - self.user = LocalSystem()  
69 - self.user.app_pkg_profile(self.rec.items_repository)  
70 - self.user.no_auto_pkg_profile()  
71 - self.sample_size = int(len(self.user.pkg_profile)*params['sample'])  
72 - # iteration should be set to 10 in config file  
73 - #self.profile_size = range(10,101,10)  
74 -  
75 - def iterate(self, params, rep, n):  
76 - if params['name'].startswith("content"):  
77 - item_score = dict.fromkeys(self.user.pkg_profile,1)  
78 - # Prepare partition  
79 - sample = {}  
80 - for i in range(self.sample_size):  
81 - key = random.choice(item_score.keys())  
82 - sample[key] = item_score.pop(key)  
83 - # Get full recommendation  
84 - user = User(item_score)  
85 - recommendation = self.rec.get_recommendation(user,self.repo_size)  
86 - # Write recall log  
87 - recall_file = "results/content/recall/%s-%s-%.2f-%d" % \  
88 - (params['strategy'],params['weight'],params['sample'],n)  
89 - output = open(recall_file,'w')  
90 - output.write("# weight=%s\n" % params['weight'])  
91 - output.write("# strategy=%s\n" % params['strategy'])  
92 - output.write("# sample=%f\n" % params['sample'])  
93 - output.write("\n%d %d %d\n" % \  
94 - (self.repo_size,len(item_score),self.sample_size))  
95 - notfound = []  
96 - ranks = []  
97 - for pkg in sample.keys():  
98 - if pkg in recommendation.ranking:  
99 - ranks.append(recommendation.ranking.index(pkg))  
100 - else:  
101 - notfound.append(pkg)  
102 - for r in sorted(ranks):  
103 - output.write(str(r)+"\n")  
104 - if notfound:  
105 - output.write("Out of recommendation:\n")  
106 - for pkg in notfound:  
107 - output.write(pkg+"\n")  
108 - output.close()  
109 - # Plot metrics summary  
110 - accuracy = []  
111 - precision = []  
112 - recall = []  
113 - f1 = []  
114 - g = Gnuplot.Gnuplot()  
115 - g('set style data lines')  
116 - g.xlabel('Recommendation size')  
117 - for size in range(1,len(recommendation.ranking)+1,100):  
118 - predicted = RecommendationResult(dict.fromkeys(recommendation.ranking[:size],1))  
119 - real = RecommendationResult(sample)  
120 - evaluation = Evaluation(predicted,real,self.repo_size)  
121 - accuracy.append([size,evaluation.run(Accuracy())])  
122 - precision.append([size,evaluation.run(Precision())])  
123 - recall.append([size,evaluation.run(Recall())])  
124 - f1.append([size,evaluation.run(F1())])  
125 - g.plot(Gnuplot.Data(accuracy,title="Accuracy"),  
126 - Gnuplot.Data(precision,title="Precision"),  
127 - Gnuplot.Data(recall,title="Recall"),  
128 - Gnuplot.Data(f1,title="F1"))  
129 - g.hardcopy(recall_file+"-plot.ps", enhanced=1, color=1)  
130 - # Iteration log  
131 - result = {'iteration': n,  
132 - 'weight': params['weight'],  
133 - 'strategy': params['strategy'],  
134 - 'accuracy': accuracy[20],  
135 - 'precision': precision[20],  
136 - 'recall:': recall[20],  
137 - 'f1': f1[20]}  
138 - return result  
139 -  
140 -#class CollaborativeSuite(expsuite.PyExperimentSuite):  
141 -# def reset(self, params, rep):  
142 -# if params['name'].startswith("collaborative"):  
143 -#  
144 -# def iterate(self, params, rep, n):  
145 -# if params['name'].startswith("collaborative"):  
146 -# for root, dirs, files in os.walk(self.source_dir):  
147 -# for popcon_file in files:  
148 -# submission = PopconSubmission(os.path.join(root,popcon_file))  
149 -# user = User(submission.packages)  
150 -# user.maximal_pkg_profile()  
151 -# rec.get_recommendation(user)  
152 -# precision = 0  
153 -# result = {'weight': params['weight'],  
154 -# 'strategy': params['strategy'],  
155 -# 'profile_size': self.profile_size[n],  
156 -# 'accuracy': accuracy,  
157 -# 'precision': precision,  
158 -# 'recall:': recall,  
159 -# 'f1': }  
160 -# else:  
161 -# result = {}  
162 -# return result  
163 -  
164 -if __name__ == '__main__':  
165 -  
166 - if "clustering" in sys.argv or len(sys.argv)<3:  
167 - ClusteringSuite().start()  
168 - if "content" in sys.argv or len(sys.argv)<3:  
169 - ContentBasedSuite().start()  
170 - #if "collaborative" in sys.argv or len(sys.argv)<3:  
171 - #CollaborativeSuite().start()  
src/experiments/pure.py 0 → 100755
@@ -0,0 +1,199 @@ @@ -0,0 +1,199 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + pure - evaluate pure content-based or collaborative strategies over a range of profile/neighborhood sizes
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import sys
  23 +sys.path.insert(0,'../')
  24 +from config import Config
  25 +from data import PopconXapianIndex, PopconSubmission
  26 +from recommender import Recommender
  27 +from user import LocalSystem, User
  28 +from evaluation import *
  29 +import logging
  30 +import random
  31 +import Gnuplot
  32 +import numpy
  33 +
def png_to_jpg_path(png_path):
    """Return png_path with a trailing ".png" extension replaced by ".jpg".

    Only an exact ".png" suffix is removed.  str.strip(".png") is not
    suitable here: it removes any run of the characters '.', 'p', 'n' and
    'g' from BOTH ends of the string and can therefore damage the path.
    """
    suffix = ".png"
    if png_path.endswith(suffix):
        png_path = png_path[:-len(suffix)]
    return png_path + ".jpg"


def load_population(sample_file, popcon_dir):
    """Return the submission file path of every user id listed in sample_file.

    sample_file holds one user id per line; blank lines are ignored.  Each
    path is built as popcon_dir/<first two characters of the id>/<id>.
    """
    import os
    population = []
    with open(sample_file, 'r') as f:
        for line in f:
            user_id = line.strip()
            if user_id:
                population.append(os.path.join(popcon_dir, user_id[:2], user_id))
    return population


def holdout_split(item_score, fraction=0.9):
    """Randomly split item_score into (kept, held_out) dictionaries.

    int(len(item_score) * fraction) randomly chosen entries are moved to
    held_out; the remaining entries stay in kept.  item_score itself is not
    modified.
    """
    kept = dict(item_score)
    held_out = {}
    for key in random.sample(list(kept), int(len(kept) * fraction)):
        held_out[key] = kept.pop(key)
    return kept, held_out


if __name__ == '__main__':
    # Usage: pure strategy_category sample_file
    #
    # strategy_category is "content" (varies the profile size) or
    # "collaborative" (varies the neighborhood size).  For every strategy of
    # the category, every user of the sample and every size, repeatedly hide
    # 90% of the user's profile, ask for a recommendation based on the
    # remaining 10%, and record precision of the top 10 and F0.5 of the top
    # 100 against the hidden part.  Logs, summary ".comment" files and
    # gnuplot graphs are written under results/<category>/<sample>/.
    #
    # NOTE(review): `os` and `PopconSystem` are not imported by name in this
    # script; presumably they arrive through `from evaluation import *` --
    # confirm.  `os` is imported here so path handling does not rely on it.
    import os

    usage = "Usage: pure strategy_category sample_file"
    # Both the strategy category and the sample file are required.
    if len(sys.argv) < 3:
        print(usage)
        sys.exit(1)

    iterations = 20
    profile_size = [10, 20, 40, 60, 80, 100, 140, 170, 200, 240]
    neighbor_size = [3, 5, 10, 20, 30, 50, 70, 100, 150, 200]

    content_strategies = ['cb', 'cbt', 'cbd', 'cbh', 'cb_eset', 'cbt_eset', 'cbd_eset', 'cbh_eset']
    collaborative_strategies = ['knn_eset', 'knn', 'knn_plus']

    # Smaller settings for a quick trial run:
    #iterations = 1
    #profile_size = [10,20,30]
    #neighbor_size = [3,5,10,20,30,50]
    #content_strategies = ['cb']
    #collaborative_strategies = ['knn']

    strategy_category = sys.argv[1]
    if strategy_category == "content":
        strategies = content_strategies
        sizes = profile_size
        option_str = "profile"
    elif strategy_category == "collaborative":
        strategies = collaborative_strategies
        sizes = neighbor_size
        option_str = "neighborhood"
    else:
        print(usage)
        sys.exit(1)

    cfg = Config()
    sample_file = sys.argv[2]
    sample_str = sample_file.split('/')[-1]
    population_sample = load_population(sample_file, cfg.popcon_dir)
    if not population_sample:
        # The repository size used by the coverage summary is only known
        # after at least one user has been evaluated.
        print("No submissions listed in %s" % sample_file)
        sys.exit(1)
    sample_dir = "results/%s/%s" % (strategy_category, sample_str)
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)

    for strategy in strategies:
        cfg.strategy = strategy
        p_10_summary = {}
        f05_100_summary = {}
        c_10 = {}
        c_100 = {}

        log_file = os.path.join(sample_dir, sample_str + "-" + cfg.strategy)
        graph_10 = log_file + "-10.png"
        graph_100 = log_file + "-100.png"
        graph_10_jpg = png_to_jpg_path(graph_10)
        graph_100_jpg = png_to_jpg_path(graph_100)
        comment_10 = graph_10_jpg + ".comment"
        comment_100 = graph_100_jpg + ".comment"

        with open(comment_10, 'w') as f:
            f.write("# sample %s\n" % sample_str)
            f.write("# strategy %s\n# threshold 10\n# iterations %d\n\n" %
                    (cfg.strategy, iterations))
            f.write("# %s\tmean_p_10\tdev_p_10\tc_10\n\n" % option_str)
        with open(comment_100, 'w') as f:
            f.write("# sample %s\n" % sample_str)
            f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
                    (cfg.strategy, iterations))
            f.write("# %s\t\tmean_f05_100\t\tdev_f05_100\t\tc_100\n\n" % option_str)

        # Empty accumulators and log headers for every size.
        for size in sizes:
            c_10[size] = set()
            c_100[size] = set()
            p_10_summary[size] = []
            f05_100_summary[size] = []
            with open(log_file + "-%s%.3d" % (option_str, size), 'w') as f:
                f.write("# sample %s\n" % sample_str)
                f.write("# strategy %s-%s%.3d\n\n" % (cfg.strategy, option_str, size))
                f.write("# p_10\tf05_100\n\n")

        # main loop per user
        for submission_file in population_sample:
            user = PopconSystem(submission_file)
            user.filter_pkg_profile(cfg.pkgs_filter)
            user.maximal_pkg_profile()
            # Scores of the packages in the user's profile; the same for
            # every size and iteration, so built once per user.
            profile_scores = dict((pkg, user.item_score[pkg]) for pkg in user.pkg_profile)
            for size in sizes:
                # The same value drives whichever option the strategy uses.
                cfg.profile_size = size
                cfg.k_neighbors = size
                rec = Recommender(cfg)
                repo_size = rec.items_repository.get_doccount()
                p_10 = []
                f05_100 = []
                for n in range(iterations):
                    # Hide 90% of the profile; recommend from the rest.
                    item_score, sample = holdout_split(profile_scores, 0.9)
                    iteration_user = User(item_score)
                    recommendation = rec.get_recommendation(iteration_user, repo_size)
                    if hasattr(recommendation, "ranking"):
                        ranking = recommendation.ranking
                        real = RecommendationResult(sample)
                        predicted_10 = RecommendationResult(dict.fromkeys(ranking[:10], 1))
                        evaluation = Evaluation(predicted_10, real, repo_size)
                        p_10.append(evaluation.run(Precision()))
                        predicted_100 = RecommendationResult(dict.fromkeys(ranking[:100], 1))
                        evaluation = Evaluation(predicted_100, real, repo_size)
                        f05_100.append(evaluation.run(F_score(0.5)))
                        # Items ever recommended, used for coverage.
                        c_10[size] = c_10[size].union(ranking[:10])
                        c_100[size] = c_100[size].union(ranking[:100])
                # save summary
                if p_10:
                    p_10_summary[size].append(numpy.mean(p_10))
                if f05_100:
                    f05_100_summary[size].append(numpy.mean(f05_100))

                with open(log_file + "-%s%.3d" % (option_str, size), 'a') as f:
                    f.write("%.4f \t%.4f\n" % (numpy.mean(p_10), numpy.mean(f05_100)))

        # back to main flow
        coverage_10 = {}
        coverage_100 = {}
        with open(comment_10, 'a') as f:
            for size in sizes:
                coverage_10[size] = len(c_10[size]) / float(repo_size)
                f.write("%3d\t\t%.4f\t\t%.4f\t\t%.4f\n" %
                        (size, numpy.mean(p_10_summary[size]),
                         numpy.std(p_10_summary[size]), coverage_10[size]))
        with open(comment_100, 'a') as f:
            for size in sizes:
                coverage_100[size] = len(c_100[size]) / float(repo_size)
                f.write("%3d\t\t%.4f\t\t%.4f\t\t%.4f\n" %
                        (size, numpy.mean(f05_100_summary[size]),
                         numpy.std(f05_100_summary[size]), coverage_100[size]))

        # plot results summary: mean, deviation (error bars) and coverage
        # as a function of the varied size
        precision_points = sorted([k, numpy.mean(p_10_summary[k]), numpy.std(p_10_summary[k])]
                                  for k in p_10_summary)
        coverage_points = sorted([k, coverage_10[k]] for k in coverage_10)
        g = Gnuplot.Gnuplot()
        g('set style data lines')
        g('set yrange [0:1.0]')
        g.xlabel('%s size' % option_str.capitalize())
        g.title("Setup: %s (threshold 10)" % cfg.strategy)
        g.plot(Gnuplot.Data(precision_points, title="Precision"),
               Gnuplot.Data(precision_points, title="Deviation",
                            with_="yerrorbar lt 2 pt 6"),
               Gnuplot.Data(coverage_points, title="Coverage"))
        g.hardcopy(graph_10, terminal="png")

        f05_points = sorted([k, numpy.mean(f05_100_summary[k]), numpy.std(f05_100_summary[k])]
                            for k in f05_100_summary)
        coverage_points = sorted([k, coverage_100[k]] for k in coverage_100)
        g = Gnuplot.Gnuplot()
        g('set style data lines')
        g('set yrange [0:1.0]')
        g.xlabel('%s size' % option_str.capitalize())
        g.title("Setup: %s (threshold 100)" % cfg.strategy)
        g.plot(Gnuplot.Data(f05_points, title="F05"),
               Gnuplot.Data(f05_points, title="Deviation",
                            with_="yerrorbar lt 2 pt 6"),
               Gnuplot.Data(coverage_points, title="Coverage"))
        g.hardcopy(graph_100, terminal="png")
src/experiments/roc-sample.py 0 → 100755
@@ -0,0 +1,240 @@ @@ -0,0 +1,240 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + recommender suite - recommender experiments suite
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import sys
  23 +sys.path.insert(0,'../')
  24 +from config import Config
  25 +from data import PopconXapianIndex, PopconSubmission
  26 +from recommender import Recommender
  27 +from user import LocalSystem, User
  28 +from evaluation import *
  29 +import logging
  30 +import random
  31 +import Gnuplot
  32 +import numpy
  33 +import shutil
  34 +
def plot_roc(results,log_file,mean=0):
    """Draw the ROC curve stored in `results` and save it as PNG and PostScript.

    results  -- object offering coverage(), get_auc(), get_roc_points() and
                the `precision` / `f05` dictionaries indexed by threshold.
    log_file -- path prefix of the output files; its last "/"-separated
                component is shown in the plot title.
    mean     -- when equal to 1 the curve is titled "mean ROC" and saved
                with the "-roc-mean" suffix; otherwise it is drawn with
                xyerrorbars and saved with the "-roc" suffix.
    """
    chart = Gnuplot.Gnuplot()
    chart('set style data lines')
    chart.xlabel('False Positive Rate')
    chart.ylabel('True Positive Rate')
    # Both axes are rates, so they share the same fixed range.
    for axis in ('x', 'y'):
        chart('set %srange [0:1.0]' % axis)
    chart.title("Setup: %s" % log_file.split("/")[-1])

    # Summary figures written as labels inside the plot area.
    chart('set label "C %.4f" at 0.68,0.2' % results.coverage())
    chart('set label "AUC %.4f" at 0.68,0.15' % results.get_auc())
    precision_at_10 = results.precision[10]
    chart('set label "P(10) %.2f +- %.2f" at 0.68,0.10' %
          (numpy.mean(precision_at_10), numpy.std(precision_at_10)))
    f05_at_100 = results.f05[100]
    chart('set label "F05(100) %.2f +- %.2f" at 0.68,0.05' %
          (numpy.mean(f05_at_100), numpy.std(f05_at_100)))

    if mean == 1:
        suffix = "-roc-mean"
        curve_options = {"title": "mean ROC"}
    else:
        suffix = "-roc"
        curve_options = {"title": "ROC", "with_": "xyerrorbars"}

    # The second data set is the straight line from (0,0) to (1,1).
    chart.plot(Gnuplot.Data(results.get_roc_points(), **curve_options),
               Gnuplot.Data([[0,0],[1,1]], with_="lines lt 7"))
    chart.hardcopy(log_file + suffix + ".png", terminal="png")
    chart.hardcopy(log_file + suffix + ".ps", terminal="postscript",
                   enhanced=1, color=1)
  57 +
def get_label(cfg):
    """Build the description/values label pair for the configured strategy.

    The strategy family is looked up in the module-level lists
    `content_based`, `collaborative` and `hybrid`.  The returned dictionary
    has the keys "description" and "values"; it is empty when the strategy
    belongs to none of the families.
    """
    strategy = cfg.strategy
    if strategy in content_based:
        return {"description": "strategy-profile",
                "values": "%s-profile%.3d" % (strategy, cfg.profile_size)}
    if strategy in collaborative:
        return {"description": "strategy-knn",
                "values": "%s-k%.3d" % (strategy, cfg.k_neighbors)}
    if strategy in hybrid:
        return {"description": "strategy-knn-profile",
                "values": "%s-k%.3d-profile%.3d" %
                          (strategy, cfg.k_neighbors, cfg.profile_size)}
    return {}
  73 +
class ExperimentResults:
    """Collect per-threshold evaluation metrics over repeated experiments.

    A threshold is a recommendation size: a ranking is cut to its first
    `size` items before being evaluated.  For every threshold the instance
    keeps one list per metric (precision, recall, fpr, f05) that receives one
    value per add_result() call, plus the set of every item recommended so
    far at that threshold.
    """

    def __init__(self,repo_size):
        """Create empty metric containers for a repository of `repo_size` items."""
        self.repository_size = repo_size
        self.precision = {}
        self.recall = {}
        self.fpr = {}
        self.f05 = {}
        self.recommended = {}
        # list() keeps the concatenation valid where range() is not a list.
        self.thresholds = [1]+list(range(10,self.repository_size,10))
        for size in self.thresholds:
            self.precision[size] = []
            self.recall[size] = []
            self.fpr[size] = []
            self.f05[size] = []
            self.recommended[size] = set()

    def add_result(self,ranking,sample):
        """Evaluate `ranking` against `sample` at every threshold.

        ranking -- ordered sequence of recommended items.
        sample  -- data handed to RecommendationResult as the real result.
        """
        for size in self.thresholds:
            recommendation = ranking[:size]
            self.recommended[size] = self.recommended[size].union(recommendation)
            predicted = RecommendationResult(dict.fromkeys(recommendation,1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repository_size)
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            self.f05[size].append(evaluation.run(F_score(0.5)))
            self.fpr[size].append(evaluation.run(FPR()))

    def precision_summary(self):
        """Return [threshold, mean precision] pairs in threshold order."""
        return [[size,numpy.mean(self.precision[size])] for size in self.thresholds]

    def recall_summary(self):
        """Return [threshold, mean recall] pairs in threshold order."""
        return [[size,numpy.mean(self.recall[size])] for size in self.thresholds]

    def f05_summary(self):
        """Return [threshold, mean F0.5] pairs in threshold order."""
        return [[size,numpy.mean(self.f05[size])] for size in self.thresholds]

    def coverage_summary(self):
        """Return [threshold, coverage] pairs in threshold order."""
        return [[size,self.coverage(size)] for size in self.thresholds]

    def coverage(self,size=0):
        """Return the fraction of the repository recommended at `size`.

        A false `size` (the default 0) selects the largest threshold.
        """
        if not size:
            size = self.thresholds[-1]
        return len(self.recommended[size])/float(self.repository_size)

    def precision(self,size):
        """Return the mean precision recorded for threshold `size`.

        NOTE: __init__ binds an instance attribute with the same name (the
        dictionary of precision lists), so on instances this method is only
        reachable through the class: ExperimentResults.precision(obj, size).
        """
        # Read the instance's own data instead of a global named `results`.
        return numpy.mean(self.precision[size])

    def get_auc(self):
        """Return the trapezoidal area under the averaged ROC curve.

        The points (0,0) and (1,1) are added at the ends before integrating.
        """
        roc_points = self.get_roc_points()
        x_roc = [0] + [p[0] for p in roc_points] + [1]
        y_roc = [0] + [p[1] for p in roc_points] + [1]
        return numpy.trapz(y=y_roc, x=x_roc)

    # Average ROC by threshold (= size of recommendation)
    def get_roc_points(self):
        """Return sorted [mean fpr, mean tpr, std fpr, std tpr] per threshold."""
        points = []
        for size in self.recall.keys():
            tpr = self.recall[size]
            fpr = self.fpr[size]
            points.append([numpy.mean(fpr),numpy.mean(tpr),numpy.std(fpr),numpy.std(tpr)])
        return sorted(points)
  140 +
def run_strategy(cfg,sample_file):
    """Run the ROC experiment for one configuration over a sample of users.

    cfg         -- configuration passed to Recommender; `popcon_dir` and
                   `pkgs_filter` are read from it here.
    sample_file -- text file with one user id per line; each id is resolved
                   to <cfg.popcon_dir>/<first two characters of id>/<id>.

    For every user the experiment is repeated `iterations` times (module
    level value).  Each repetition moves a random 90% of the user profile
    into a sample, asks for a recommendation based on the remaining items
    and records how the ranking scores against the sample.  Plots and a
    ".comment" summary are written under results/roc-sample/<sample name>/.
    """
    rec = Recommender(cfg)
    repo_size = rec.items_repository.get_doccount()
    results = ExperimentResults(repo_size)
    label = get_label(cfg)
    population_sample = []
    sample_str = sample_file.split('/')[-1]
    with open(sample_file,'r') as f:
        for line in f:
            user_id = line.strip()
            # Blank lines (e.g. a trailing newline) would otherwise turn
            # into a path that names no submission.
            if not user_id:
                continue
            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
    sample_dir = ("results/roc-sample/%s" % sample_str)
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)
    log_file = os.path.join(sample_dir,label["values"])

    # n iterations per population user
    for submission_file in population_sample:
        user = PopconSystem(submission_file)
        user.filter_pkg_profile(cfg.pkgs_filter)
        user.maximal_pkg_profile()
        for n in range(iterations):
            # Fill sample profile
            profile_len = len(user.pkg_profile)
            item_score = {}
            for pkg in user.pkg_profile:
                item_score[pkg] = user.item_score[pkg]
            sample = {}
            sample_size = int(profile_len*0.9)
            for i in range(sample_size):
                # list() gives random.choice an indexable sequence of keys.
                key = random.choice(list(item_score))
                sample[key] = item_score.pop(key)
            iteration_user = User(item_score)
            recommendation = rec.get_recommendation(iteration_user,repo_size)
            # Results without a ranking cannot be scored and are skipped.
            if hasattr(recommendation,"ranking"):
                results.add_result(recommendation.ranking,sample)

    plot_roc(results,log_file)
    plot_roc(results,log_file,1)
    with open(log_file+"-roc.jpg.comment",'w') as f:
        f.write("# %s\n# %s\n\n" %
                (label["description"],label["values"]))
        f.write("# roc AUC\n%.4f\n\n"%results.get_auc())
        f.write("# threshold\tmean_fpr\tdev_fpr\t\tmean_tpr\tdev_tpr\t\tcoverage\n")
        for size in results.thresholds:
            # coverage() already returns a single number; no averaging needed.
            f.write("%4d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\n" %
                    (size,numpy.mean(results.fpr[size]),
                     numpy.std(results.fpr[size]),
                     numpy.mean(results.recall[size]),
                     numpy.std(results.recall[size]),
                     results.coverage(size)))
  192 +
def run_content(cfg,sample_file):
    """Run the experiment once for each value of the module-level `profile_size`."""
    for candidate_size in profile_size:
        cfg.profile_size = candidate_size
        run_strategy(cfg, sample_file)
  197 +
def run_collaborative(cfg,sample_file):
    """Run the experiment once for each value of the module-level `neighbors`."""
    for neighbor_count in neighbors:
        cfg.k_neighbors = neighbor_count
        run_strategy(cfg, sample_file)
  202 +
def run_hybrid(cfg,sample_file):
    """Run the experiment for every (neighbors, profile_size) combination.

    Both value lists are module-level names; neighbors vary in the outer
    loop and profile sizes in the inner one.
    """
    for neighbor_count in neighbors:
        cfg.k_neighbors = neighbor_count
        for candidate_size in profile_size:
            cfg.profile_size = candidate_size
            run_strategy(cfg, sample_file)
  209 +
if __name__ == '__main__':
    # Both arguments are mandatory: sys.argv[2] is read unconditionally
    # below, so a missing sample path must be reported here.
    if len(sys.argv)<3:
        print("Usage: sample-roc strategy_str popcon_sample_path")
        sys.exit(1)

    # Experiment parameters.  These names are module-level on purpose: the
    # functions above read them as globals.
    # For a quick trial run use e.g. iterations = 3, content_based = ['cb'],
    # collaborative = ['knn_eset'], hybrid = ['knnco'],
    # profile_size = [50,100] and neighbors = [50].
    iterations = 20
    content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
    collaborative = ['knn_eset','knn','knn_plus']
    hybrid = ['knnco','knnco_eset']
    profile_size = [10,20,50,100,200]
    neighbors = [200]
    # Wider parameter sweeps used previously:
    #   neighbors = [3,10,50,100,200]
    #   profile_size = [10,20,40,60,80,100,140,170,200,240]
    #   neighbors = [3,5,10,20,30,50,70,100,150,200]

    cfg = Config()
    cfg.strategy = sys.argv[1]
    sample_file = sys.argv[2]

    # A strategy listed in none of the families runs nothing.
    if cfg.strategy in content_based:
        run_content(cfg,sample_file)
    if cfg.strategy in collaborative:
        run_collaborative(cfg,sample_file)
    if cfg.strategy in hybrid:
        run_hybrid(cfg,sample_file)
src/experiments/roc-single.py 0 → 100755
@@ -0,0 +1,269 @@ @@ -0,0 +1,269 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + recommender suite - recommender experiments suite
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import sys
  23 +sys.path.insert(0,'../')
  24 +from config import Config
  25 +from data import PopconXapianIndex, PopconSubmission
  26 +from recommender import Recommender
  27 +from user import LocalSystem, User
  28 +from evaluation import *
  29 +import logging
  30 +import random
  31 +import Gnuplot
  32 +import numpy
  33 +import shutil
  34 +
def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
    """Write the recall log of iteration `n` to the file "<log_file>-<n>".

    label          -- dictionary with the keys "description" and "values".
    n              -- iteration number, zero-padded to two digits in the
                      file name and in the header.
    sample         -- mapping whose keys are the packages to look for.
    recommendation -- object that may carry a `ranking` sequence; without
                      that attribute only the header is written.
    profile_size   -- written in the header next to `repo_size`.
    repo_size      -- written in the header.
    log_file       -- path prefix of the log file.

    After the header, the 0-based ranking positions of the sample packages
    are written in ascending order, one per line, followed by the packages
    that do not appear in the ranking.
    """
    # The context manager closes the file even when a write fails.
    with open(("%s-%.2d" % (log_file,n)),'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%.2d\n" % (label["values"],n))
        output.write("\n# repository profile sample\n%d %d %d\n" %
                     (repo_size,profile_size,len(sample)))
        if hasattr(recommendation,"ranking"):
            # Index the ranking once so each lookup is a dictionary access
            # rather than a list scan; setdefault keeps the first position
            # of an item, like list.index() does.
            # Assumes ranking items are hashable (package names).
            positions = {}
            for position, pkg in enumerate(recommendation.ranking):
                positions.setdefault(pkg, position)
            notfound = []
            ranks = []
            for pkg in sample:
                if pkg in positions:
                    ranks.append(positions[pkg])
                else:
                    notfound.append(pkg)
            for r in sorted(ranks):
                output.write(str(r)+"\n")
            if notfound:
                output.write("# out of recommendation:\n")
                for pkg in notfound:
                    output.write(pkg+"\n")
  57 +
def plot_summary(results,log_file):
    """Plot precision, recall, F05 and coverage against the threshold.

    results  -- object providing precision_summary(), recall_summary(),
                f05_summary() and coverage_summary().
    log_file -- path prefix of the output; four files are written:
                ".png" / ".ps" with a linear x axis and "-logscale.png" /
                "-logscale.ps" with a logarithmic one.
    """
    chart = Gnuplot.Gnuplot()
    chart('set style data lines')
    chart('set yrange [0:1.0]')
    chart.xlabel('Threshold (recommendation size)')
    chart.title("Setup: %s" % log_file.split("/")[-1])

    curves = [
        Gnuplot.Data(results.precision_summary(), title="Precision"),
        Gnuplot.Data(results.recall_summary(), title="Recall"),
        Gnuplot.Data(results.f05_summary(), title="F05"),
        Gnuplot.Data(results.coverage_summary(), title="Coverage"),
    ]
    chart.plot(*curves)

    for suffix in ("", "-logscale"):
        if suffix:
            # Second pass: same curves redrawn on a logarithmic x axis.
            chart('set logscale x')
            chart('replot')
        chart.hardcopy(log_file + suffix + ".png", terminal="png")
        chart.hardcopy(log_file + suffix + ".ps", terminal="postscript",
                       enhanced=1, color=1)
  75 +
def plot_roc(results,log_file):
    """Draw the ROC curve stored in `results` and save it as PNG and PostScript.

    results  -- object offering coverage(), get_auc(), get_roc_points() and
                the `precision` / `f05` dictionaries indexed by threshold.
    log_file -- path prefix; the plot goes to "<log_file>-roc.png" and
                "<log_file>-roc.ps", and the last "/"-separated component
                of the prefix is shown in the title.
    """
    setup_name = log_file.split("/")[-1]
    chart = Gnuplot.Gnuplot()
    chart('set style data lines')
    chart.xlabel('False Positive Rate')
    chart.ylabel('True Positive Rate')
    # Both axes are rates, so they share the same fixed range.
    for axis in ('x', 'y'):
        chart('set %srange [0:1.0]' % axis)
    chart.title("Setup: %s" % setup_name)

    # Summary figures written as labels inside the plot area.
    chart('set label "C %.2f" at 0.8,0.25' % results.coverage())
    chart('set label "AUC %.2f" at 0.8,0.2' % results.get_auc())
    chart('set label "P(10) %.2f" at 0.8,0.15' % numpy.mean(results.precision[10]))
    chart('set label "P(20) %.2f" at 0.8,0.10' % numpy.mean(results.precision[20]))
    chart('set label "F05(100) %.2f" at 0.8,0.05' % numpy.mean(results.f05[100]))

    # The second data set is the straight line from (0,0) to (1,1).
    roc_curve = Gnuplot.Data(results.get_roc_points(), title="ROC")
    diagonal = Gnuplot.Data([[0,0],[1,1]], with_="lines lt 7")
    chart.plot(roc_curve, diagonal)

    output_prefix = log_file + "-roc"
    chart.hardcopy(output_prefix + ".png", terminal="png")
    chart.hardcopy(output_prefix + ".ps", terminal="postscript",
                   enhanced=1, color=1)
  94 +
def get_label(cfg):
    """Build the description/values label pair for the configured strategy.

    The strategy family is looked up in the module-level lists
    `content_based`, `collaborative` and `hybrid`.  The returned dictionary
    has the keys "description" and "values"; it is empty when the strategy
    belongs to none of the families.
    """
    strategy = cfg.strategy
    if strategy in content_based:
        return {"description": "strategy-profile",
                "values": "%s-profile%.3d" % (strategy, cfg.profile_size)}
    if strategy in collaborative:
        return {"description": "strategy-knn",
                "values": "%s-k%.3d" % (strategy, cfg.k_neighbors)}
    if strategy in hybrid:
        return {"description": "strategy-knn-profile",
                "values": "%s-k%.3d-profile%.3d" %
                          (strategy, cfg.k_neighbors, cfg.profile_size)}
    return {}
  110 +
class ExperimentResults:
    """Collect per-threshold evaluation metrics over repeated experiments.

    A threshold is a recommendation size: a ranking is cut to its first
    `size` items before being evaluated.  For every threshold the instance
    keeps one list per metric (precision, recall, fpr, f05) that receives one
    value per add_result() call, plus the set of every item recommended so
    far at that threshold.
    """

    def __init__(self,repo_size):
        """Create empty metric containers for a repository of `repo_size` items."""
        self.repository_size = repo_size
        self.precision = {}
        self.recall = {}
        self.fpr = {}
        self.f05 = {}
        self.recommended = {}
        # list() keeps the concatenation valid where range() is not a list.
        self.thresholds = [1]+list(range(10,self.repository_size,10))
        for size in self.thresholds:
            self.precision[size] = []
            self.recall[size] = []
            self.fpr[size] = []
            self.f05[size] = []
            self.recommended[size] = set()

    def add_result(self,ranking,sample):
        """Evaluate `ranking` against `sample` at every threshold.

        ranking -- ordered sequence of recommended items.
        sample  -- data handed to RecommendationResult as the real result.
        """
        for size in self.thresholds:
            recommendation = ranking[:size]
            self.recommended[size] = self.recommended[size].union(recommendation)
            predicted = RecommendationResult(dict.fromkeys(recommendation,1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repository_size)
            # Precision is computed once; the value goes to the debug log
            # instead of being printed to stdout for every threshold.
            precision_value = evaluation.run(Precision())
            logging.debug("precision at threshold %d: %s", size, precision_value)
            self.precision[size].append(precision_value)
            self.recall[size].append(evaluation.run(Recall()))
            self.f05[size].append(evaluation.run(F_score(0.5)))
            self.fpr[size].append(evaluation.run(FPR()))

    def precision_summary(self):
        """Return [threshold, mean precision] pairs in threshold order."""
        return [[size,numpy.mean(self.precision[size])] for size in self.thresholds]

    def recall_summary(self):
        """Return [threshold, mean recall] pairs in threshold order."""
        return [[size,numpy.mean(self.recall[size])] for size in self.thresholds]

    def f05_summary(self):
        """Return [threshold, mean F0.5] pairs in threshold order."""
        return [[size,numpy.mean(self.f05[size])] for size in self.thresholds]

    def coverage_summary(self):
        """Return [threshold, coverage] pairs in threshold order."""
        return [[size,self.coverage(size)] for size in self.thresholds]

    def coverage(self,size=0):
        """Return the fraction of the repository recommended at `size`.

        A false `size` (the default 0) selects the largest threshold.
        """
        if not size:
            size = self.thresholds[-1]
        return len(self.recommended[size])/float(self.repository_size)

    def precision(self,size):
        """Return the mean precision recorded for threshold `size`.

        NOTE: __init__ binds an instance attribute with the same name (the
        dictionary of precision lists), so on instances this method is only
        reachable through the class: ExperimentResults.precision(obj, size).
        """
        # Read the instance's own data instead of a global named `results`.
        return numpy.mean(self.precision[size])

    def get_auc(self):
        """Return the trapezoidal area under the averaged ROC curve.

        The points (0,0) and (1,1) are added at the ends before integrating.
        """
        roc_points = self.get_roc_points()
        x_roc = [0] + [p[0] for p in roc_points] + [1]
        y_roc = [0] + [p[1] for p in roc_points] + [1]
        return numpy.trapz(y=y_roc, x=x_roc)

    # Average ROC by threshold (= size of recommendation)
    def get_roc_points(self):
        """Return the sorted [mean fpr, mean tpr] point of every threshold.

        Raises ZeroDivisionError when a threshold has no recorded result.
        """
        points = []
        for size in self.recall.keys():
            tpr = self.recall[size]
            fpr = self.fpr[size]
            points.append([sum(fpr)/len(fpr),sum(tpr)/len(tpr)])
        return sorted(points)
  178 +
def run_strategy(cfg,user):
    """Run the ROC experiment for one configuration and one user.

    cfg  -- configuration handed to Recommender; `strategy` names the
            output sub-directory.
    user -- object with `user_id`, `pkg_profile` and `item_score`.

    The experiment is repeated `iterations` times (module-level value).
    Each repetition moves a random 90% of the user profile into a sample,
    asks for a recommendation based on the remaining items, writes a recall
    log and records how the ranking scores against the sample.  A summary
    ".comment" file (copied under three names) and the ROC and summary
    plots are written under results/roc-suite/<first 8 chars of user id>/
    <strategy>/.
    """
    recommender = Recommender(cfg)
    repo_size = recommender.items_repository.get_doccount()
    results = ExperimentResults(repo_size)
    label = get_label(cfg)

    user_dir = "results/roc-suite/%s/%s" % (user.user_id[:8], cfg.strategy)
    if not os.path.exists(user_dir):
        os.makedirs(user_dir)
    log_file = os.path.join(user_dir, label["values"])

    for iteration in range(iterations):
        # Start from a fresh copy of the scores of the profile packages.
        profile_len = len(user.pkg_profile)
        remaining = dict((pkg, user.item_score[pkg]) for pkg in user.pkg_profile)
        # Move randomly chosen entries into the sample until it holds 90%.
        sample = {}
        moved = 0
        while moved < int(profile_len*0.9):
            chosen = random.choice(list(remaining))
            sample[chosen] = remaining.pop(chosen)
            moved += 1
        iteration_user = User(remaining)
        recommendation = recommender.get_recommendation(iteration_user, repo_size)
        write_recall_log(label, iteration, sample, recommendation,
                         profile_len, repo_size, log_file)
        # Results without a ranking cannot be scored and are skipped.
        if hasattr(recommendation, "ranking"):
            results.add_result(recommendation.ranking, sample)

    comment_file = log_file + "-roc.jpg.comment"
    with open(comment_file, 'w') as summary:
        summary.write("# %s\n# %s\n\n" % (label["description"], label["values"]))
        summary.write("# roc AUC\n%.4f\n\n" % results.get_auc())
        summary.write("# threshold\tprecision\trecall\t\tf05\t\tcoverage\n")
        for size in results.thresholds:
            row = (size,
                   numpy.mean(results.precision[size]),
                   numpy.mean(results.recall[size]),
                   numpy.mean(results.f05[size]),
                   numpy.mean(results.coverage(size)))
            summary.write("%4d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\n" % row)
    # The same summary accompanies the linear and the log-scale plots.
    for suffix in (".jpg.comment", "-logscale.jpg.comment"):
        shutil.copy(comment_file, log_file + suffix)
    plot_roc(results, log_file)
    plot_summary(results, log_file)
  219 +
def run_content(user,cfg):
    """Run the experiment once for each value of the module-level `profile_size`."""
    for candidate_size in profile_size:
        cfg.profile_size = candidate_size
        run_strategy(cfg, user)
  224 +
def run_collaborative(user,cfg):
    """Run the experiment once for each value of the module-level `neighbors`."""
    for neighbor_count in neighbors:
        cfg.k_neighbors = neighbor_count
        run_strategy(cfg, user)
  229 +
def run_hybrid(user,cfg):
    """Run the experiment for every (neighbors, profile_size) combination.

    Both value lists are module-level names; neighbors vary in the outer
    loop and profile sizes in the inner one.
    """
    for neighbor_count in neighbors:
        cfg.k_neighbors = neighbor_count
        for candidate_size in profile_size:
            cfg.profile_size = candidate_size
            run_strategy(cfg, user)
  236 +
if __name__ == '__main__':
    if len(sys.argv)<2:
        print("Usage: roc-suite strategy_str [popcon_submission_path]")
        sys.exit(1)

    # Experiment parameters.  These names are module-level on purpose: the
    # functions above read them as globals.
    # For a quick trial run use e.g. iterations = 3, content_based = ['cb'],
    # collaborative = ['knn_eset'], hybrid = ['knnco'],
    # profile_size = [50,100] and neighbors = [50].
    iterations = 20
    content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
    collaborative = ['knn_eset','knn','knn_plus']
    hybrid = ['knnco','knnco_eset']
    profile_size = [10,20,40,60,80,100,140,170,200,240]
    neighbors = [3,5,10,20,30,50,70,100,150,200]

    cfg = Config()
    cfg.strategy = sys.argv[1]

    # Honour the optional second argument advertised by the usage message;
    # without it the previously hard-coded submission is used.
    default_submission = "/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7"
    if len(sys.argv) > 2:
        submission_path = sys.argv[2]
    else:
        submission_path = default_submission
    user = PopconSystem(submission_path)
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    # A strategy listed in none of the families runs nothing.
    if cfg.strategy in content_based:
        run_content(user,cfg)
    if cfg.strategy in collaborative:
        run_collaborative(user,cfg)
    if cfg.strategy in hybrid:
        run_hybrid(user,cfg)
src/experiments/roc-suite.py
@@ -1,328 +0,0 @@ @@ -1,328 +0,0 @@
1 -#!/usr/bin/env python  
2 -"""  
3 - recommender suite - recommender experiments suite  
4 -"""  
5 -__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"  
6 -__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"  
7 -__license__ = """  
8 - This program is free software: you can redistribute it and/or modify  
9 - it under the terms of the GNU General Public License as published by  
10 - the Free Software Foundation, either version 3 of the License, or  
11 - (at your option) any later version.  
12 -  
13 - This program is distributed in the hope that it will be useful,  
14 - but WITHOUT ANY WARRANTY; without even the implied warranty of  
15 - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the  
16 - GNU General Public License for more details.  
17 -  
18 - You should have received a copy of the GNU General Public License  
19 - along with this program. If not, see <http://www.gnu.org/licenses/>.  
20 -"""  
21 -  
22 -import sys  
23 -sys.path.insert(0,'../')  
24 -from config import Config  
25 -from data import PopconXapianIndex, PopconSubmission  
26 -from recommender import Recommender  
27 -from user import LocalSystem, User  
28 -from evaluation import *  
29 -import logging  
30 -import random  
31 -import Gnuplot  
32 -import numpy  
33 -  
34 -#iterations = 3  
35 -#sample_proportions = [0.9]  
36 -#weighting = [('bm25',1.2)]  
37 -#collaborative = ['knn_eset']  
38 -#content_based = ['cb']  
39 -#hybrid = ['knnco']  
40 -#profile_size = [50,100]  
41 -#popcon_size = ["1000"]  
42 -#neighbors = [50]  
43 -  
44 -iterations = 30  
45 -sample_proportions = [0.9]  
46 -weighting = [('bm25',1.0),('bm25',1.2),('bm25',2.0),('trad',0)]  
47 -content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']  
48 -collaborative = ['knn_eset','knn','knn_plus']  
49 -hybrid = ['knnco','knnco_eset']  
50 -profile_size = range(20,200,20)  
51 -neighbors = range(10,510,50)  
52 -  
53 -def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):  
54 - # Write recall log  
55 - output = open(("%s-%.2d" % (log_file,n)),'w')  
56 - output.write("# %s-n\n" % label["description"])  
57 - output.write("# %s-%.2d\n" % (label["values"],n))  
58 - output.write("\n# repository profile sample\n%d %d %d\n" % \  
59 - (repo_size,profile_size,len(sample)))  
60 - if hasattr(recommendation,"ranking"):  
61 - notfound = []  
62 - ranks = []  
63 - for pkg in sample.keys():  
64 - if pkg in recommendation.ranking:  
65 - ranks.append(recommendation.ranking.index(pkg))  
66 - else:  
67 - notfound.append(pkg)  
68 - for r in sorted(ranks):  
69 - output.write(str(r)+"\n")  
70 - if notfound:  
71 - output.write("# out of recommendation:\n")  
72 - for pkg in notfound:  
73 - output.write(pkg+"\n")  
74 - output.close()  
75 -  
76 -def plot_roc(roc_points,auc,eauc,c,p,log_file):  
77 - g = Gnuplot.Gnuplot()  
78 - g('set style data lines')  
79 - g.xlabel('False Positive Rate')  
80 - g.ylabel('True Positive Rate')  
81 - g('set xrange [0:1.0]')  
82 - g('set yrange [0:1.0]')  
83 - g.title("Setup: %s" % log_file.split("/")[-1])  
84 - g('set label "C %.2f" at 0.8,0.25' % c)  
85 - g('set label "P(20) %.2f" at 0.8,0.2' % p)  
86 - g('set label "AUC %.4f" at 0.8,0.15' % auc)  
87 - g('set label "EAUC %.4f" at 0.8,0.1' % eauc)  
88 - g.plot(Gnuplot.Data(roc_points,title="ROC"),  
89 - Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"),  
90 - Gnuplot.Data([roc_points[-1],[1,1]],with_="lines lt 6"))  
91 - g.hardcopy(log_file+"-roc.png",terminal="png")  
92 - g.hardcopy(log_file+"-roc.ps",terminal="postscript",enhanced=1,color=1)  
93 -  
94 -def plot_summary(precision,recall,f1,f05,accuracy,log_file):  
95 - # Plot metrics summary  
96 - g = Gnuplot.Gnuplot()  
97 - g('set style data lines')  
98 - g.xlabel('Recommendation size')  
99 - g.title("Setup: %s" % log_file.split("/")[-1])  
100 - g.plot(Gnuplot.Data(accuracy,title="Accuracy"),  
101 - Gnuplot.Data(precision,title="Precision"),  
102 - Gnuplot.Data(recall,title="Recall"),  
103 - Gnuplot.Data(f1,title="F_1"),  
104 - Gnuplot.Data(f05,title="F_0.5"))  
105 - g.hardcopy(log_file+".png",terminal="png")  
106 - g.hardcopy(log_file+".ps",terminal="postscript",enhanced=1,color=1)  
107 - g('set logscale x')  
108 - g('replot')  
109 - g.hardcopy(log_file+"-logscale.png",terminal="png")  
110 - g.hardcopy(log_file+"-logscale.ps",terminal="postscript",enhanced=1,color=1)  
111 -  
112 -def get_label(cfg,sample_proportion):  
113 - label = {}  
114 - if cfg.strategy in content_based:  
115 - label["description"] = "strategy-filter-profile-k1_bm25"  
116 - label["values"] = ("%s-profile%.3d-%s-kbm%.1f" %  
117 - (cfg.strategy,cfg.profile_size,  
118 - cfg.pkgs_filter.split("/")[-1],  
119 - cfg.bm25_k1))  
120 - elif cfg.strategy in collaborative:  
121 - label["description"] = "strategy-knn-filter-k1_bm25"  
122 - label["values"] = ("%s-k%.3d-%s-kbm%.1f" %  
123 - (cfg.strategy,cfg.k_neighbors,  
124 - cfg.pkgs_filter.split("/")[-1],  
125 - cfg.bm25_k1))  
126 - elif cfg.strategy in hybrid:  
127 - label["description"] = "strategy-knn-filter-profile-k1_bm25"  
128 - label["values"] = ("%s-k%.3d-profile%.3d-%s-kbm%.1f" %  
129 - (cfg.strategy,cfg.k_neighbors,cfg.profile_size,  
130 - cfg.pkgs_filter.split("/")[-1],  
131 - cfg.bm25_k1))  
132 - else:  
133 - print "Unknown strategy"  
134 - return label  
135 -  
136 -class ExperimentResults:  
137 - def __init__(self,repo_size):  
138 - self.repository_size = repo_size  
139 - self.accuracy = {}  
140 - self.precision = {}  
141 - self.recall = {}  
142 - self.f1 = {}  
143 - self.f05 = {}  
144 - self.fpr = {}  
145 - #points = [1]+range(10,200,10)+range(200,self.repository_size,100)  
146 - points = [1]+range(10,self.repository_size,10)  
147 - self.recommended = set()  
148 - for size in points:  
149 - self.accuracy[size] = []  
150 - self.precision[size] = []  
151 - self.recall[size] = []  
152 - self.f1[size] = []  
153 - self.f05[size] = []  
154 - self.fpr[size] = []  
155 -  
156 - def add_result(self,ranking,sample):  
157 - print "len_recommended", len(self.recommended)  
158 - print "len_rank", len(ranking)  
159 - self.recommended = self.recommended.union(ranking)  
160 - print "len_recommended", len(self.recommended)  
161 - # get data only for point  
162 - for size in self.accuracy.keys():  
163 - predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))  
164 - real = RecommendationResult(sample)  
165 - evaluation = Evaluation(predicted,real,self.repository_size)  
166 - #self.accuracy[size].append(evaluation.run(Accuracy()))  
167 - self.precision[size].append(evaluation.run(Precision()))  
168 - self.recall[size].append(evaluation.run(Recall()))  
169 - #self.f1[size].append(evaluation.run(F_score(1)))  
170 - #self.f05[size].append(evaluation.run(F_score(0.5)))  
171 - self.fpr[size].append(evaluation.run(FPR()))  
172 -  
173 - # Average ROC by threshold (whici is the size)  
174 - def get_roc_points(self):  
175 - points = []  
176 - for size in self.recall.keys():  
177 - tpr = self.recall[size]  
178 - fpr = self.fpr[size]  
179 - points.append([sum(fpr)/len(fpr),sum(tpr)/len(tpr)])  
180 - return sorted(points)  
181 -  
182 - def get_precision_summary(self):  
183 - summary = [[size,sum(values)/len(values)] for size,values in self.precision.items()]  
184 - return sorted(summary)  
185 -  
186 - def get_recall_summary(self):  
187 - summary = [[size,sum(values)/len(values)] for size,values in self.recall.items()]  
188 - return sorted(summary)  
189 -  
190 - def get_f1_summary(self):  
191 - summary = [[size,sum(values)/len(values)] for size,values in self.f1.items()]  
192 - return sorted(summary)  
193 -  
194 - def get_f05_summary(self):  
195 - summary = [[size,sum(values)/len(values)] for size,values in self.f05.items()]  
196 - return sorted(summary)  
197 -  
198 - def get_accuracy_summary(self):  
199 - summary = [[size,sum(values)/len(values)] for size,values in self.accuracy.items()]  
200 - return sorted(summary)  
201 -  
202 - def best_precision(self):  
203 - size = max(self.precision, key = lambda x: max(self.precision[x]) and x>10)  
204 - return (size,max(self.precision[size]))  
205 -  
206 - def best_f1(self):  
207 - size = max(self.f1, key = lambda x: max(self.f1[x]))  
208 - return (size,max(self.f1[size]))  
209 -  
210 - def best_f05(self):  
211 - size = max(self.f05, key = lambda x: max(self.f05[x]))  
212 - return (size,max(self.f05[size]))  
213 -  
214 -def run_strategy(cfg,user):  
215 - for weight in weighting:  
216 - cfg.weight = weight[0]  
217 - cfg.bm25_k1 = weight[1]  
218 - rec = Recommender(cfg)  
219 - repo_size = rec.items_repository.get_doccount()  
220 - for proportion in sample_proportions:  
221 - results = ExperimentResults(repo_size)  
222 - label = get_label(cfg,proportion)  
223 - #log_file = "results/20110906/4a67a295/"+label["values"]  
224 - log_file = "results/"+label["values"]  
225 - for n in range(iterations):  
226 - # Fill sample profile  
227 - profile_size = len(user.pkg_profile)  
228 - item_score = {}  
229 - for pkg in user.pkg_profile:  
230 - item_score[pkg] = user.item_score[pkg]  
231 - sample = {}  
232 - sample_size = int(profile_size*proportion)  
233 - for i in range(sample_size):  
234 - key = random.choice(item_score.keys())  
235 - sample[key] = item_score.pop(key)  
236 - iteration_user = User(item_score)  
237 - recommendation = rec.get_recommendation(iteration_user,repo_size)  
238 - #write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file)  
239 - if hasattr(recommendation,"ranking"):  
240 - results.add_result(recommendation.ranking,sample)  
241 - with open(log_file,'w') as f:  
242 - roc_points = results.get_roc_points()  
243 - x_coord = [p[0] for p in roc_points]  
244 - y_coord = [p[1] for p in roc_points]  
245 - auc = numpy.trapz(y=y_coord, x=x_coord)  
246 - eauc = (auc+  
247 - numpy.trapz(y=[0,roc_points[0][1]],x=[0,roc_points[0][0]])+  
248 - numpy.trapz(y=[roc_points[-1][1],1],x=[roc_points[-1][0],1]))  
249 - precision_20 = sum(results.precision[10])/len(results.precision[10])  
250 - print results.recommended  
251 - print "len",len(results.recommended)  
252 - coverage = len(results.recommended)/float(repo_size)  
253 - print "repo_size: ", float(repo_size)  
254 - print coverage  
255 - exit(1)  
256 - #f1_10 = sum(results.f1[10])/len(results.f1[10])  
257 - #f05_10 = sum(results.f05[10])/len(results.f05[10])  
258 - f.write("# %s\n# %s\n\n" %  
259 - (label["description"],label["values"]))  
260 - f.write("# coverage \tp(20) \tauc \teauc\n\t%.2f \t%.2f \t%.4f \t%.4f\n\n" %  
261 - (coverage,precision_20,auc,eauc))  
262 - #f.write("# best results (recommendation size; metric)\n")  
263 - #f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\nf05 (%d; %.2f)\n\n" %  
264 - # (results.best_precision()[0],results.best_precision()[1],  
265 - # results.best_f1()[0],results.best_f1()[1],  
266 - # results.best_f05()[0],results.best_f05()[1]))  
267 - #f.write("# recommendation size 10\nprecision (10; %.2f)\nf1 (10; %.2f)\nf05 (10; %.2f)" %  
268 - # (precision_10,f1_10,f05_10))  
269 - #precision = results.get_precision_summary()  
270 - #recall = results.get_recall_summary()  
271 - #f1 = results.get_f1_summary()  
272 - #f05 = results.get_f05_summary()  
273 - #accuracy = results.get_accuracy_summary()  
274 - #plot_summary(precision,recall,f1,f05,accuracy,log_file)  
275 - plot_roc(roc_points,auc,eauc,coverage,precision_20,log_file)  
276 -  
277 -def run_content(user,cfg):  
278 - for strategy in content_based:  
279 - cfg.strategy = strategy  
280 - for size in profile_size:  
281 - cfg.profile_size = size  
282 - run_strategy(cfg,user)  
283 -  
284 -def run_collaborative(user,cfg):  
285 - popcon_desktopapps = cfg.popcon_desktopapps  
286 - popcon_programs = cfg.popcon_programs  
287 - for strategy in collaborative:  
288 - cfg.strategy = strategy  
289 - for k in neighbors:  
290 - cfg.k_neighbors = k  
291 - #for size in popcon_size:  
292 - # if size:  
293 - # cfg.popcon_desktopapps = popcon_desktopapps+"_"+size  
294 - # cfg.popcon_programs = popcon_programs+"_"+size  
295 - run_strategy(cfg,user)  
296 -  
297 -def run_hybrid(user,cfg):  
298 - popcon_desktopapps = cfg.popcon_desktopapps  
299 - popcon_programs = cfg.popcon_programs  
300 - for strategy in hybrid:  
301 - cfg.strategy = strategy  
302 - for k in neighbors:  
303 - cfg.k_neighbors = k  
304 - #for size in popcon_size:  
305 - # if size:  
306 - # cfg.popcon_desktopapps = popcon_desktopapps+"_"+size  
307 - # cfg.popcon_programs = popcon_programs+"_"+size  
308 - for size in profile_size:  
309 - cfg.profile_size = size  
310 - run_strategy(cfg,user)  
311 -  
312 -if __name__ == '__main__':  
313 - #user = LocalSystem()  
314 - #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))  
315 -  
316 - cfg = Config()  
317 - #user = PopconSystem("/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7")  
318 - user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")  
319 - #user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a5834eb2aba6b6f17312239e0761c70")  
320 - user.filter_pkg_profile(cfg.pkgs_filter)  
321 - user.maximal_pkg_profile()  
322 -  
323 - if "content" in sys.argv or len(sys.argv)<2:  
324 - run_content(user,cfg)  
325 - if "collaborative" in sys.argv or len(sys.argv)<2:  
326 - run_collaborative(user,cfg)  
327 - if "hybrid" in sys.argv or len(sys.argv)<2:  
328 - run_hybrid(user,cfg)  
src/experiments/runner.py
@@ -1,171 +0,0 @@ @@ -1,171 +0,0 @@
1 -#!/usr/bin/env python  
2 -"""  
3 - recommender suite - recommender experiments suite  
4 -"""  
5 -__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"  
6 -__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"  
7 -__license__ = """  
8 - This program is free software: you can redistribute it and/or modify  
9 - it under the terms of the GNU General Public License as published by  
10 - the Free Software Foundation, either version 3 of the License, or  
11 - (at your option) any later version.  
12 -  
13 - This program is distributed in the hope that it will be useful,  
14 - but WITHOUT ANY WARRANTY; without even the implied warranty of  
15 - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the  
16 - GNU General Public License for more details.  
17 -  
18 - You should have received a copy of the GNU General Public License  
19 - along with this program. If not, see <http://www.gnu.org/licenses/>.  
20 -"""  
21 -  
22 -import expsuite  
23 -import sys  
24 -sys.path.insert(0,'../')  
25 -from config import Config  
26 -from data import PopconXapianIndex, PopconSubmission  
27 -from recommender import Recommender  
28 -from user import LocalSystem, User  
29 -from evaluation import *  
30 -import logging  
31 -import random  
32 -import Gnuplot  
33 -  
class ClusteringSuite(expsuite.PyExperimentSuite):
    """
    Experiment suite that rebuilds the popcon index with different
    numbers of medoids and records the resulting cluster dispersion.
    """
    def reset(self, params, rep):
        """
        Create a fresh configuration pointing at the test data set; the
        "clustering" experiment additionally forces a recluster.
        """
        cfg = Config()
        cfg.popcon_index = "../tests/test_data/.sample_pxi"
        cfg.popcon_dir = "../tests/test_data/popcon_dir"
        cfg.clusters_dir = "../tests/test_data/clusters_dir"
        self.cfg = cfg

        if params['name'] == "clustering":
            logging.info("Starting 'clustering' experiments suite...")
            cfg.index_mode = "recluster"

    def iterate(self, params, rep, n):
        """
        Build the index with the n-th value of params['medoids'] and
        return it together with the dispersion reported by the index.
        Experiments other than "clustering" produce an empty result.
        """
        if params['name'] != "clustering":
            return {}
        logging.info("Running iteration %d" % params['medoids'][n])
        self.cfg.k_medoids = params['medoids'][n]
        index = PopconXapianIndex(self.cfg)
        return {'k_medoids': params['medoids'][n],
                'dispersion': index.cluster_dispersion}
55 -  
class ContentBasedSuite(expsuite.PyExperimentSuite):
    """
    Experiment suite for content-based strategies: part of the local
    system profile is hidden, a full recommendation is computed from the
    rest, and the position of the hidden packages in the ranking is
    logged and plotted.
    """

    # Index, in the per-size metric series, of the point reported in the
    # iteration result. Sizes advance in steps of 100 starting at 1, so
    # index 20 corresponds to a recommendation of 2001 items.
    summary_point = 20

    def reset(self, params, rep):
        """
        Prepare recommender, user profile and sample size for experiments
        whose name starts with "content"; other experiments are ignored.
        """
        if not params['name'].startswith("content"):
            return
        cfg = Config()
        #if the index was not built yet
        #app_axi = AppAptXapianIndex(cfg.axi,"results/arnaldo/AppAxi")
        cfg.axi = "data/AppAxi"
        cfg.index_mode = "old"
        cfg.weight = params['weight']
        self.rec = Recommender(cfg)
        self.rec.set_strategy(params['strategy'])
        self.repo_size = self.rec.items_repository.get_doccount()
        self.user = LocalSystem()
        self.user.app_pkg_profile(self.rec.items_repository)
        self.user.no_auto_pkg_profile()
        self.sample_size = int(len(self.user.pkg_profile)*params['sample'])
        # iteration should be set to 10 in config file
        #self.profile_size = range(10,101,10)

    def iterate(self, params, rep, n):
        """
        Run one random partition of the profile and return the metrics of
        the iteration as a dictionary.

        An empty dictionary is returned for experiments whose name does
        not start with "content" (same convention as ClusteringSuite).
        Raises IndexError if the recommendation ranking is empty, since
        no metric point exists to report.
        """
        if not params['name'].startswith("content"):
            return {}
        item_score = dict.fromkeys(self.user.pkg_profile,1)
        # Prepare partition: move 'sample_size' random items out of the
        # profile; they become the expected ("real") recommendation.
        sample = {}
        for _ in range(self.sample_size):
            key = random.choice(list(item_score))
            sample[key] = item_score.pop(key)
        # Get full recommendation
        user = User(item_score)
        recommendation = self.rec.get_recommendation(user,self.repo_size)
        ranking = recommendation.ranking
        # First position of each package in the ranking, so every lookup
        # below is a dictionary access instead of a list scan.
        position = {}
        for rank, pkg in enumerate(ranking):
            position.setdefault(pkg,rank)
        ranks = sorted(position[pkg] for pkg in sample if pkg in position)
        notfound = [pkg for pkg in sample if pkg not in position]
        # Write recall log
        recall_file = "results/content/recall/%s-%s-%.2f-%d" % \
                      (params['strategy'],params['weight'],params['sample'],n)
        with open(recall_file,'w') as output:
            output.write("# weight=%s\n" % params['weight'])
            output.write("# strategy=%s\n" % params['strategy'])
            output.write("# sample=%f\n" % params['sample'])
            output.write("\n%d %d %d\n" % \
                         (self.repo_size,len(item_score),self.sample_size))
            for rank in ranks:
                output.write(str(rank)+"\n")
            if notfound:
                output.write("Out of recommendation:\n")
                for pkg in notfound:
                    output.write(pkg+"\n")
        # Plot metrics summary
        accuracy = []
        precision = []
        recall = []
        f1 = []
        g = Gnuplot.Gnuplot()
        g('set style data lines')
        g.xlabel('Recommendation size')
        for size in range(1,len(ranking)+1,100):
            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repo_size)
            accuracy.append([size,evaluation.run(Accuracy())])
            precision.append([size,evaluation.run(Precision())])
            recall.append([size,evaluation.run(Recall())])
            f1.append([size,evaluation.run(F1())])
        g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
               Gnuplot.Data(precision,title="Precision"),
               Gnuplot.Data(recall,title="Recall"),
               Gnuplot.Data(f1,title="F1"))
        g.hardcopy(recall_file+"-plot.ps", enhanced=1, color=1)
        # Iteration log. Rankings too short to reach 'summary_point'
        # report their last available point instead of failing.
        point = min(self.summary_point,len(accuracy)-1)
        # NOTE(review): the key used to be spelled 'recall:' (stray
        # colon); confirm nothing downstream reads the misspelled key.
        return {'iteration': n,
                'weight': params['weight'],
                'strategy': params['strategy'],
                'accuracy': accuracy[point],
                'precision': precision[point],
                'recall': recall[point],
                'f1': f1[point]}
139 -  
140 -#class CollaborativeSuite(expsuite.PyExperimentSuite):  
141 -# def reset(self, params, rep):  
142 -# if params['name'].startswith("collaborative"):  
143 -#  
144 -# def iterate(self, params, rep, n):  
145 -# if params['name'].startswith("collaborative"):  
146 -# for root, dirs, files in os.walk(self.source_dir):  
147 -# for popcon_file in files:  
148 -# submission = PopconSubmission(os.path.join(root,popcon_file))  
149 -# user = User(submission.packages)  
150 -# user.maximal_pkg_profile()  
151 -# rec.get_recommendation(user)  
152 -# precision = 0  
153 -# result = {'weight': params['weight'],  
154 -# 'strategy': params['strategy'],  
155 -# 'profile_size': self.profile_size[n],  
156 -# 'accuracy': accuracy,  
157 -# 'precision': precision,  
158 -# 'recall:': recall,  
159 -# 'f1': }  
160 -# else:  
161 -# result = {}  
162 -# return result  
163 -  
if __name__ == '__main__':
    # With fewer than two command-line arguments every suite is started;
    # otherwise only the suites named on the command line.
    start_all = len(sys.argv)<3
    if start_all or "clustering" in sys.argv:
        ClusteringSuite().start()
    if start_all or "content" in sys.argv:
        ContentBasedSuite().start()
    #if "collaborative" in sys.argv or len(sys.argv)<3:
        #CollaborativeSuite().start()
src/experiments/sample-popcon-arch.py 0 → 100755
@@ -0,0 +1,44 @@ @@ -0,0 +1,44 @@
  1 +#! /usr/bin/env python
  2 +"""
  3 + sample-popcon-arch - extract a sample of a specific arch
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +import sys
  22 +sys.path.insert(0,'../')
  23 +import xapian
  24 +import os
  25 +import random
  26 +import sys
  27 +from user import RandomPopcon
  28 +
def parse_args(argv):
    """
    Extract (size, arch, popcon_dir, pkgs_filter) from a command line.

    'argv' follows the sys.argv layout (program name first). ValueError,
    carrying the usage message, is raised when an argument is missing or
    'size' is not an integer.
    """
    try:
        size = int(argv[1])
        arch = argv[2]
        popcon_dir = argv[3]
        pkgs_filter = argv[4]
    except (IndexError, ValueError):
        raise ValueError("Usage: sample-popcon-arch size arch popcon_dir pkgs_filter")
    return size, arch, popcon_dir, pkgs_filter


if __name__ == '__main__':
    try:
        size, arch, popcon_dir, pkgs_filter = parse_args(sys.argv)
    except ValueError as error:
        print(str(error))
        sys.exit(1)

    # One user id per line; each id comes from a freshly drawn random
    # submission of the requested architecture.
    sample_file = ("results/misc-popcon/sample-%s-%d" % (arch,size))
    with open(sample_file,'w') as f:
        for n in range(1,size+1):
            user = RandomPopcon(popcon_dir,arch,pkgs_filter)
            f.write(user.user_id+'\n')
            print("sample %d" % n)
src/experiments/strategies-suite.py
@@ -1,274 +0,0 @@ @@ -1,274 +0,0 @@
1 -#!/usr/bin/env python  
2 -"""  
3 - recommender suite - recommender experiments suite  
4 -"""  
5 -__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"  
6 -__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"  
7 -__license__ = """  
8 - This program is free software: you can redistribute it and/or modify  
9 - it under the terms of the GNU General Public License as published by  
10 - the Free Software Foundation, either version 3 of the License, or  
11 - (at your option) any later version.  
12 -  
13 - This program is distributed in the hope that it will be useful,  
14 - but WITHOUT ANY WARRANTY; without even the implied warranty of  
15 - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the  
16 - GNU General Public License for more details.  
17 -  
18 - You should have received a copy of the GNU General Public License  
19 - along with this program. If not, see <http://www.gnu.org/licenses/>.  
20 -"""  
21 -  
22 -import sys  
23 -sys.path.insert(0,'../')  
24 -from config import Config  
25 -from data import PopconXapianIndex, PopconSubmission, AppAptXapianIndex  
26 -from recommender import Recommender  
27 -from user import LocalSystem, User  
28 -from evaluation import *  
29 -import logging  
30 -import random  
31 -import Gnuplot  
32 -  
33 -#iterations = 3  
34 -#sample_proportions = [0.9]  
35 -#weighting = [('bm25',1.2)]  
36 -#collaborative = ['knn']  
37 -#content_based = []  
38 -#hybrid = ['knnco']  
39 -#profile_size = [50,100]  
40 -#popcon_size = ["1000"]  
41 -#neighbors = [50]  
42 -  
# Number of random partitions evaluated for each experiment setup.
iterations = 10
# Fractions of the user profile held out as the expected recommendation.
sample_proportions = [0.5, 0.6, 0.7, 0.8, 0.9]
# (weighting scheme, bm25 k1 parameter) pairs applied to the configuration.
weighting = [('bm25', 1.2), ('bm25', 1.6), ('bm25', 2.0), ('trad', 0)]
# Strategy names, grouped by family.
content_based = ['cb', 'cbt', 'cbd', 'cbh',
                 'cb_eset', 'cbt_eset', 'cbd_eset', 'cbh_eset']
collaborative = ['knn_eset', 'knn', 'knn_plus']
hybrid = ['knnco', 'knnco_eset']

# Values tried for cfg.profile_size and cfg.k_neighbors.
profile_size = range(20, 100, 20)
#popcon_size = [1000,10000,50000,'full']
neighbors = range(10, 510, 50)
53 -  
def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
    """
    Write the recall log of iteration 'n' to the file "<log_file>-<n>".

    The file starts with two comment lines built from label["description"]
    and label["values"], followed by a line with repository size, profile
    size and sample size. If 'recommendation' has a 'ranking' attribute,
    the (0-based) ranking position of every sampled package found in it
    is written in ascending order, followed by the list of sampled
    packages that are missing from the ranking.
    """
    # 'with' guarantees the log is closed even if a write fails.
    with open(("%s-%d" % (log_file,n)),'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%d\n" % (label["values"],n))
        output.write("\n%d %d %d\n" % \
                     (repo_size,profile_size,len(sample)))
        if not hasattr(recommendation,"ranking"):
            return
        # First position of each package, computed once so that every
        # sampled package costs a dictionary lookup, not a list scan.
        position = {}
        for rank, pkg in enumerate(recommendation.ranking):
            position.setdefault(pkg,rank)
        ranks = []
        notfound = []
        for pkg in sample:
            if pkg in position:
                ranks.append(position[pkg])
            else:
                notfound.append(pkg)
        for rank in sorted(ranks):
            output.write(str(rank)+"\n")
        if notfound:
            output.write("Out of recommendation:\n")
            for pkg in notfound:
                output.write(pkg+"\n")
76 -  
def plot_summary(precision,recall,f1,f05,accuracy,log_file):
    """
    Plot the five metric series against recommendation size and save the
    chart as PNG and PostScript, first with a linear and then with a
    logarithmic x axis ("<log_file>" and "<log_file>-logscale").
    """
    def save(prefix):
        # One PNG and one colour PostScript copy of the current plot.
        plotter.hardcopy(prefix+".png",terminal="png")
        plotter.hardcopy(prefix+".ps",terminal="postscript",enhanced=1,color=1)

    series = [(accuracy,"Accuracy"),
              (precision,"Precision"),
              (recall,"Recall"),
              (f1,"F_1"),
              (f05,"F_0.5")]
    plotter = Gnuplot.Gnuplot()
    plotter('set style data lines')
    plotter.xlabel('Recommendation size')
    plotter.title("Setup: %s" % log_file.split("/")[-1])
    plotter.plot(*[Gnuplot.Data(points,title=name) for points,name in series])
    save(log_file)
    plotter('set logscale x')
    plotter('replot')
    save(log_file+"-logscale")
94 -  
def get_label(cfg,sample_proportion):
    """
    Build the label identifying an experiment setup.

    Returns a dictionary with two entries: "values", the dash-separated
    setup identifier (also used to name the result files), and
    "description", the names of the fields of "values" in the same order.
    For a strategy that belongs to none of the known families a message
    is printed and an empty dictionary is returned.
    """
    label = {}
    if cfg.strategy in content_based:
        # Field order fixed to match "values" (profile comes before filter).
        label["description"] = "strategy-profile-filter-k1_bm25-sample"
        label["values"] = ("%s-profile%d-%s-kbm%.1f-sample%.1f" %
                           (cfg.strategy,cfg.profile_size,
                            cfg.pkgs_filter.split("/")[-1],
                            cfg.bm25_k1,sample_proportion))
    elif cfg.strategy in collaborative:
        label["description"] = "strategy-knn-filter-k1_bm25-sample"
        label["values"] = ("%s-k%d-%s-kbm%.1f-sample%.1f" %
                           (cfg.strategy,cfg.k_neighbors,
                            cfg.pkgs_filter.split("/")[-1],
                            cfg.bm25_k1,sample_proportion))
    elif cfg.strategy in hybrid:
        # Field order fixed to match "values" (profile comes before filter).
        label["description"] = "strategy-knn-profile-filter-k1_bm25-sample"
        label["values"] = ("%s-k%d-profile%d-%s-kbm%.1f-sample%.1f" %
                           (cfg.strategy,cfg.k_neighbors,cfg.profile_size,
                            cfg.pkgs_filter.split("/")[-1],
                            cfg.bm25_k1,sample_proportion))
    else:
        print("Unknown strategy")
    return label
118 -  
class ExperimentResults:
    """
    Accumulate evaluation metrics per recommendation size.

    Each metric attribute (accuracy, precision, recall, f1, f05) maps a
    recommendation size to the list of values measured for that size,
    one value per call to add_result().
    """
    def __init__(self,repo_size):
        """
        Create empty result lists for sizes 1, 10..190 (step 10) and
        200..repo_size-1 (step 100).
        """
        self.repository_size = repo_size
        points = ([1]+list(range(10,200,10))+
                  list(range(200,self.repository_size,100)))
        self.accuracy = dict((size,[]) for size in points)
        self.precision = dict((size,[]) for size in points)
        self.recall = dict((size,[]) for size in points)
        self.f1 = dict((size,[]) for size in points)
        self.f05 = dict((size,[]) for size in points)

    def add_result(self,ranking,sample):
        """
        Evaluate the first 'size' items of 'ranking' against 'sample' for
        every tracked size and store the resulting metrics.
        """
        for size in self.accuracy:
            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repository_size)
            self.accuracy[size].append(evaluation.run(Accuracy()))
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            self.f1[size].append(evaluation.run(F_score(1)))
            self.f05[size].append(evaluation.run(F_score(0.5)))

    def _summary(self,metric):
        """
        Return the sorted list of [size, mean value] pairs of 'metric'.
        Sizes without any recorded value are left out, and the mean is
        always computed with true division.
        """
        return sorted([size,sum(values)/float(len(values))]
                      for size,values in metric.items() if values)

    def _best(self,metric):
        """
        Return (size, value) for the highest single value recorded in
        'metric'. Raises ValueError when nothing was recorded.
        """
        peaks = [(size,max(values)) for size,values in metric.items() if values]
        if not peaks:
            raise ValueError("no results recorded")
        return max(peaks, key = lambda peak: peak[1])

    def get_precision_summary(self):
        """Mean precision per recommendation size."""
        return self._summary(self.precision)

    def get_recall_summary(self):
        """Mean recall per recommendation size."""
        return self._summary(self.recall)

    def get_f1_summary(self):
        """Mean F1 per recommendation size."""
        return self._summary(self.f1)

    def get_f05_summary(self):
        """Mean F0.5 per recommendation size."""
        return self._summary(self.f05)

    def get_accuracy_summary(self):
        """Mean accuracy per recommendation size."""
        return self._summary(self.accuracy)

    def best_precision(self):
        """(size, value) of the best precision recorded."""
        return self._best(self.precision)

    def best_f1(self):
        """(size, value) of the best F1 recorded."""
        return self._best(self.f1)

    def best_f05(self):
        """(size, value) of the best F0.5 recorded."""
        return self._best(self.f05)
177 -  
def _split_profile(user,proportion):
    """
    Randomly move 'proportion' of the user's profile into a held-out
    sample. Returns (remaining item scores, sample, profile size).
    """
    profile_size = len(user.pkg_profile)
    item_score = dict((pkg,user.item_score[pkg]) for pkg in user.pkg_profile)
    sample = {}
    for _ in range(int(profile_size*proportion)):
        key = random.choice(list(item_score))
        sample[key] = item_score.pop(key)
    return item_score,sample,profile_size

def _write_summary(log_file,label,results,coverage):
    """
    Write the textual summary of a setup: coverage, best results and the
    mean metrics for recommendations of size 10.
    """
    precision_10 = sum(results.precision[10])/len(results.precision[10])
    f1_10 = sum(results.f1[10])/len(results.f1[10])
    f05_10 = sum(results.f05[10])/len(results.f05[10])
    best_precision = results.best_precision()
    best_f1 = results.best_f1()
    best_f05 = results.best_f05()
    with open(log_file,'w') as f:
        f.write("# %s\n# %s\n\ncoverage %d\n\n" %
                (label["description"],label["values"],coverage))
        f.write("# best results (recommendation size; metric)\n")
        f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\nf05 (%d; %.2f)\n\n" %
                (best_precision[0],best_precision[1],
                 best_f1[0],best_f1[1],
                 best_f05[0],best_f05[1]))
        f.write("# recommendation size 10\nprecision (10; %.2f)\nf1 (10; %.2f)\nf05 (10; %.2f)" %
                (precision_10,f1_10,f05_10))

def run_strategy(cfg,user):
    """
    Evaluate the strategy currently set in 'cfg' for every weighting
    scheme and every sample proportion.

    For each setup, 'iterations' random partitions of the user profile
    are evaluated; each one writes its recall log, and the setup as a
    whole gets a summary file and plots under "results/strategies/".
    A setup for which no iteration produced a ranking is reported with a
    warning and gets no summary or plot.
    """
    for weight in weighting:
        cfg.weight = weight[0]
        cfg.bm25_k1 = weight[1]
        rec = Recommender(cfg)
        repo_size = rec.items_repository.get_doccount()
        for proportion in sample_proportions:
            results = ExperimentResults(repo_size)
            label = get_label(cfg,proportion)
            log_file = "results/strategies/"+label["values"]
            for n in range(iterations):
                item_score,sample,profile_size = _split_profile(user,proportion)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,repo_size)
                write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
            # Nothing to average when no iteration produced a ranking
            # (or no iteration ran at all).
            if not results.precision[10]:
                logging.warning("No ranking produced for %s; summary skipped" %
                                label["values"])
                continue
            _write_summary(log_file,label,results,recommendation.size)
            precision = results.get_precision_summary()
            recall = results.get_recall_summary()
            f1 = results.get_f1_summary()
            f05 = results.get_f05_summary()
            accuracy = results.get_accuracy_summary()
            plot_summary(precision,recall,f1,f05,accuracy,log_file)
223 -  
def run_content(user,cfg):
    """
    Run the experiment for every content-based strategy combined with
    every size listed in the module-level 'profile_size'.
    """
    for content_strategy in content_based:
        cfg.strategy = content_strategy
        for n_items in profile_size:
            cfg.profile_size = n_items
            run_strategy(cfg,user)
230 -  
def run_collaborative(user,cfg):
    """
    Run the experiment for every collaborative strategy and every
    neighbourhood size listed in the module-level 'neighbors'.
    """
    # Base popcon index locations; only the (disabled) per-size variant
    # of this experiment derives "<base>_<size>" paths from them.
    base_desktopapps = cfg.popcon_desktopapps
    base_programs = cfg.popcon_programs
    for collaborative_strategy in collaborative:
        cfg.strategy = collaborative_strategy
        for n_neighbors in neighbors:
            cfg.k_neighbors = n_neighbors
            run_strategy(cfg,user)
243 -  
def run_hybrid(user,cfg):
    """
    Run the experiment for every hybrid strategy, combining each
    neighbourhood size in 'neighbors' with each size in 'profile_size'.
    """
    # Base popcon index locations; only the (disabled) per-size variant
    # of this experiment derives "<base>_<size>" paths from them.
    base_desktopapps = cfg.popcon_desktopapps
    base_programs = cfg.popcon_programs
    for hybrid_strategy in hybrid:
        cfg.strategy = hybrid_strategy
        for n_neighbors in neighbors:
            cfg.k_neighbors = n_neighbors
            for n_items in profile_size:
                cfg.profile_size = n_items
                run_strategy(cfg,user)
258 -  
if __name__ == '__main__':
    # The module-level import only brings LocalSystem and User from
    # 'user'; PopconSystem is imported here so the script does not stop
    # with a NameError.
    from user import PopconSystem

    # Other user sources tried during development:
    #   LocalSystem()
    #   RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
    #   PopconSystem(".../popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
    cfg = Config()
    user = PopconSystem("/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7")
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    # Without command-line arguments every family of strategies is run;
    # otherwise only the families named on the command line.
    run_everything = len(sys.argv)<2
    if run_everything or "content" in sys.argv:
        run_content(user,cfg)
    if run_everything or "collaborative" in sys.argv:
        run_collaborative(user,cfg)
    if run_everything or "hybrid" in sys.argv:
        run_hybrid(user,cfg)
@@ -111,7 +111,7 @@ class User: @@ -111,7 +111,7 @@ class User:
111 """ 111 """
112 Define a user of a recommender. 112 Define a user of a recommender.
113 """ 113 """
114 - def __init__(self,item_score,user_id=0,demo_profiles_set=0): 114 + def __init__(self,item_score,user_id=0,arch=0,demo_profiles_set=0):
115 """ 115 """
116 Set initial user attributes. pkg_profile gets the whole set of items, 116 Set initial user attributes. pkg_profile gets the whole set of items,
117 a random user_id is set if none was provided and the demographic 117 a random user_id is set if none was provided and the demographic
@@ -119,6 +119,7 @@ class User: @@ -119,6 +119,7 @@ class User:
119 """ 119 """
120 self.item_score = item_score 120 self.item_score = item_score
121 self.pkg_profile = self.items() 121 self.pkg_profile = self.items()
  122 + self.arch = arch
122 123
123 if user_id: 124 if user_id:
124 self.user_id = user_id 125 self.user_id = user_id
@@ -272,21 +273,28 @@ class User: @@ -272,21 +273,28 @@ class User:
272 return self.pkg_profile 273 return self.pkg_profile
273 274
class RandomPopcon(User):
    def __init__(self,submissions_dir,arch=0,pkgs_filter=0):
        """
        Pick a random popcon submission under 'submissions_dir' and use
        it as the user.

        Submissions are drawn until one has a profile of at least 100
        packages (after applying 'pkgs_filter', if given) and, when
        'arch' is given, was submitted from that architecture. The
        architecture is checked for every drawn submission, so leaving
        'arch' unset accepts any architecture. The user is initialized
        with the full package list of the chosen submission.

        Raises IndexError if 'submissions_dir' holds no files. The draw
        never ends if no submission satisfies the constraints.
        """
        # List the submissions once instead of walking the tree on
        # every draw.
        candidates = [os.path.join(root, submission) for
                      root, dirs, files in os.walk(submissions_dir)
                      for submission in files]
        while True:
            path = random.choice(candidates)
            user = PopconSystem(path)
            if arch and user.arch!=arch:
                continue
            if pkgs_filter:
                user.filter_pkg_profile(pkgs_filter)
            if len(user.pkg_profile) >= 100:
                break
        submission = data.PopconSubmission(path)
        User.__init__(self,submission.packages,submission.user_id,submission.arch)
290 298
291 class PopconSystem(User): 299 class PopconSystem(User):
292 def __init__(self,path,user_id=0): 300 def __init__(self,path,user_id=0):
@@ -296,7 +304,7 @@ class PopconSystem(User): @@ -296,7 +304,7 @@ class PopconSystem(User):
296 submission = data.PopconSubmission(path) 304 submission = data.PopconSubmission(path)
297 if not user_id: 305 if not user_id:
298 user_id = submission.user_id 306 user_id = submission.user_id
299 - User.__init__(self,submission.packages,user_id) 307 + User.__init__(self,submission.packages,user_id,submission.arch)
300 308
301 class PkgsListSystem(User): 309 class PkgsListSystem(User):
302 def __init__(self,pkgs_list_or_file,user_id=0): 310 def __init__(self,pkgs_list_or_file,user_id=0):