Commit 94efb102510a4e1f84d73385e35ce801856749d9

Authored by Tássia Camões Araújo
2 parents ef8c9733 b33c0cb1
Exists in master and in 1 other branch add_vagrant

Merge branch 'master' of https://github.com/tassia/AppRecommender

README
... ... @@ -6,10 +6,7 @@ Install dependencies
6 6  
7 7 # apt-get install \
8 8 python python-xapian python-apt python-cluster python-webpy python-simplejson \
9   -python-unittest2 python-numpy python-gnuplot \
10   -apt-xapian-index gnuplot
11   -
12   -# cd ./src; git clone https://github.com/rueckstiess/expsuite
  9 +python-numpy apt-xapian-index app-install-data python-xdg
13 10  
14 11  
15 12 Run AppRecommender web UI
... ... @@ -20,4 +17,5 @@ Run AppRecommender web UI
20 17  
21 18 Open a browser and access http://localhost:8080
22 19  
  20 +
23 21 More info at https://github.com/tassia/AppRecommender/wiki
... ...
src/bin/cross_validation.py
... ... @@ -37,7 +37,7 @@ if __name__ == '__main__':
37 37 #user = LocalSystem()
38 38 #user = RandomPopcon(cfg.popcon_dir)
39 39 #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
40   - user = PopconSystem("/home/tassia/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
  40 + user = PopconSystem(os.path.expanduser("~/.app-recommender/popcon-entries/00/0001166d0737c6dffb083071e5ee69f5"))
41 41 user.filter_pkg_profile(os.path.join(cfg.filters_dir,"desktopapps"))
42 42 user.maximal_pkg_profile()
43 43 begin_time = datetime.datetime.now()
... ... @@ -48,7 +48,7 @@ if __name__ == '__main__':
48 48 metrics.append(F_score(0.5))
49 49 metrics.append(Accuracy())
50 50 metrics.append(FPR())
51   - validation = CrossValidation(0.9,10,rec,metrics,1)
  51 + validation = CrossValidation(0.9,20,rec,metrics,0.005)
52 52 validation.run(user)
53 53 print validation
54 54  
... ...
src/bin/get_axipkgs.py 0 → 100755
... ... @@ -0,0 +1,42 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + AppRecommender - A GNU/Linux application recommender
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import os
  23 +import sys
  24 +sys.path.insert(0,'../')
  25 +import xapian
  26 +
if __name__ == '__main__':
    # Print one name per line for every document of the Xapian index given
    # on the command line, taken from the document's first "XP"-prefixed term.
    if len(sys.argv) < 2:
        print("Usage: get_axipkgs index_path")
        sys.exit(1)

    axi_path = sys.argv[1]
    axi = xapian.Database(axi_path)
    # get_lastdocid() is itself a valid document id, so the range has to
    # include it.
    for docid in range(1, axi.get_lastdocid() + 1):
        try:
            doc = axi.get_document(docid)
        except xapian.DocNotFoundError:
            # Not every id up to the last one is necessarily in use.
            continue
        xp_terms = [t.term for t in doc.termlist() if t.term.startswith("XP")]
        if not xp_terms:
            # Document without an XP term: nothing to report for it.
            continue
        # Remove exactly the two-character prefix. lstrip('XP') strips a
        # *set of characters* and would also eat leading 'X'/'P' letters
        # that belong to the name.
        print(xp_terms[0][len("XP"):])
... ...
src/bin/get_desktop.sh
1 1 #!/usr/bin/env bash
2 2 #
3   -# get_desktop.sh - get packages which have desktop files
  3 +# get_desktop.sh - get packages which have desktop files
  4 +#
  5 +# DEPRECATED: use get_axipkgs.py to get this info from axi
4 6  
5 7 cd /usr/share/app-install/desktop
6 8 sed -ne 's/X-AppInstall-Package=//p' * | sort -u | grep -v kdelibs | grep -v libfm-gtk0
... ...
src/bin/get_pkgs_inst.py
1 1 #!/usr/bin/env python
2 2 #
3 3 # get_pkgs_inst.py - get tuple (package,installation) from popcon results file
  4 +#
  5 +# results_file: org/popcon.debian.org/popcon-mail/results
4 6  
  7 +import sys
5 8 from operator import itemgetter
  9 +
6 10 if __name__ == '__main__':
  11 + if len(sys.argv)<2:
  12 + print "Usage: get_pkgs_inst popcon_results_path"
  13 + exit(1)
  14 +
  15 + results_path = sys.argv[1]
7 16 pkgs_inst = {}
8   - with open("/root/org/popcon.debian.org/popcon-mail/results") as results:
  17 + with open(results_path) as results:
9 18 for line in results:
10 19 if line.startswith("Package"):
11 20 fields = line.split()
12 21 inst = int(fields[2])+int(fields[3])+int(fields[4])
13   - if inst > 20:
14   - pkgs_inst[fields[1]] = inst
  22 + pkgs_inst[fields[1]] = inst
15 23 sorted_by_inst = sorted(pkgs_inst.items(), key=itemgetter(1))
16 24 for pkg, inst in sorted_by_inst:
17 25 print pkg, inst
... ...
src/config.py
... ... @@ -40,7 +40,7 @@ class Config(Singleton):
40 40 ## general options
41 41 self.debug = 0
42 42 self.verbose = 1
43   - self.output = "log"
  43 + self.output = "apprec.log"
44 44  
45 45 ## data_source options
46 46 self.base_dir = os.path.expanduser("/home/tiago/.app-recommender/")
... ... @@ -103,13 +103,14 @@ class Config(Singleton):
103 103 print " -f, --filtersdir=PATH Path to filters directory"
104 104 print " -b, --pkgsfilter=FILTER File containing packages to be considered for recommendations"
105 105 print " -a, --axi=PATH Path to apt-xapian-index"
106   - print " -e, --dde=URL DDE url"
107 106 print " -p, --popconindex=PATH Path to popcon index"
108   - print " -m, --popcondir=PATH Path to popcon submissions dir"
109   - print " -u, --indexmode=MODE 'old'|'reindex'|'cluster'|'recluster'"
110   - print " -l, --clustersdir=PATH Path to popcon clusters dir"
111   - print " -c, --medoids=k Number of medoids for clustering"
112   - print " -x, --maxpopcon=k Number of submissions to be considered"
  107 + print " -e, --dde=URL DDE url"
  108 + # deprecated options
  109 + #print " -m, --popcondir=PATH Path to popcon submissions dir"
  110 + #print " -u, --indexmode=MODE 'old'|'reindex'|'cluster'|'recluster'"
  111 + #print " -l, --clustersdir=PATH Path to popcon clusters dir"
  112 + #print " -c, --medoids=k Number of medoids for clustering"
  113 + #print " -x, --maxpopcon=k Number of submissions to be considered"
113 114 print ""
114 115 print " [ recommender ]"
115 116 print " -w, --weight=OPTION Search weighting scheme"
... ... @@ -123,11 +124,19 @@ class Config(Singleton):
123 124 print " bm25 = bm25 weighting scheme"
124 125 print ""
125 126 print " [ strategy options ] "
126   - print " cb = content-based "
127   - print " cbt = content-based using only tags as content "
128   - print " cbd = content-based using only package descriptions as content "
129   - print " col = collaborative "
130   - print " colct = collaborative through tags content "
  127 + print " cb = content-based, mixed profile"
  128 + print " cbt = content-based, tags only profile"
  129 + print " cbd = content-based, description terms only profile"
  130 + print " cbh = content-based, half-half profile"
  131 + print " cb_eset = cb with eset profiling"
  132 + print " cbt_eset = cbt with eset profiling"
  133 + print " cbd_eset = cbd_eset with eset profiling"
  134 + print " cbh_eset = cbh with eset profiling"
  135 + print " knn = collaborative, tf-idf knn"
  136 + print " knn_plus = collaborative, tf-idf weighted knn"
  137 + print " knn_eset = collaborative, eset knn"
  138 + print " knnco = collaborative through content"
  139 + print " knnco_eset = collaborative through content, eset recommendation"
131 140  
132 141 def read_option(self, section, option):
133 142 """
... ...
src/evaluation.py
... ... @@ -140,6 +140,29 @@ class FPR(Metric):
140 140 return (float(len(evaluation.false_positive))/
141 141 evaluation.real_negative_len)
142 142  
class MCC(Metric):
    """
    Matthews correlation coefficient.
    """
    def __init__(self):
        """
        Set metric description.
        """
        self.desc = " MCC "

    def run(self,evaluation):
        """
        Compute the coefficient from the counts held by `evaluation`.

        Returns 0 when any of the four marginal totals is zero, because the
        denominator of the formula would then be zero.
        """
        tp = len(evaluation.true_positive)
        fp = len(evaluation.false_positive)
        fn = len(evaluation.false_negative)
        tn = evaluation.true_negative_len
        marginals = (tp+fp, tp+fn, tn+fp, tn+fn)
        if 0 in marginals:
            return 0
        denominator = math.sqrt(marginals[0]*marginals[1]*marginals[2]*marginals[3])
        return (tp*tn - fp*fn)/denominator
  165 +
143 166 class F_score(Metric):
144 167 """
145 168 Classification accuracy metric which correlates precision and recall into an
... ...
src/experiments/README
1   -Experiments handled by expsuite:
2   -https://github.com/rueckstiess/expsuite
  1 +AppRecommender experiments and tests
  2 +---------------------------------------
  3 +
  4 +Install dependencies:
  5 +
  6 +# apt-get install \
  7 +python-unittest2 python-gnuplot gnuplot
  8 +
  9 +# cd ./src; git clone https://github.com/rueckstiess/expsuite (deprecated tests)
... ...
src/experiments/deprecated/k-suite.py 0 → 100755
... ... @@ -0,0 +1,186 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + k-suite - experiment different neighborhood sizes
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import sys
  23 +sys.path.insert(0,'../')
  24 +from config import Config
  25 +from data import PopconXapianIndex, PopconSubmission
  26 +from recommender import Recommender
  27 +from user import LocalSystem, User
  28 +from evaluation import *
  29 +import logging
  30 +import random
  31 +import Gnuplot
  32 +import numpy
  33 +
def plot_roc(k,roc_points,log_file):
    """
    Plot `roc_points` for neighbourhood size `k` together with the diagonal
    from (0,0) to (1,1), and save the graph as "<log_file>-kNNN.png" and
    "<log_file>-kNNN.ps".
    """
    setup_name = log_file.split("/")[-1]
    output_base = log_file+("-k%.3d"%k)

    plotter = Gnuplot.Gnuplot()
    for command in ('set style data points',):
        plotter(command)
    plotter.xlabel('False Positive Rate')
    plotter.ylabel('True Positive Rate')
    for command in ('set xrange [0:1.0]','set yrange [0:1.0]'):
        plotter(command)
    plotter.title("Setup: %s-k%d" % (setup_name,k))

    diagonal = Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7")
    points = Gnuplot.Data(roc_points)
    plotter.plot(diagonal,points)

    plotter.hardcopy(output_base+".png",terminal="png")
    plotter.hardcopy(output_base+".ps",terminal="postscript",enhanced=1,color=1)
  46 +
def plot_summary(precision,f05,mcc,log_file):
    """
    Plot the mean precision, F0.5 and MCC against the neighbourhood size.

    `precision`, `f05` and `mcc` each map a neighbourhood size k to the
    non-empty list of values collected for that k. The graph is saved as
    "<log_file>.png" and "<log_file>.ps".
    """
    def mean_curve(metric):
        # The curves are drawn with the 'lines' style, which joins points in
        # the order they are given; dict order is arbitrary, so sort by k.
        return [[k,sum(metric[k])/len(metric[k])] for k in sorted(metric)]

    g = Gnuplot.Gnuplot()
    g('set style data lines')
    g.xlabel('Neighborhood (k)')
    g.title("Setup: %s-size20" % (log_file.split("/")[-1]))
    g.plot(Gnuplot.Data(mean_curve(precision),title="P"),
           Gnuplot.Data(mean_curve(f05),title="F05"),
           Gnuplot.Data(mean_curve(mcc),title="MCC"))
    g.hardcopy(log_file+(".png"),terminal="png")
    g.hardcopy(log_file+(".ps"),terminal="postscript",enhanced=1,color=1)
  57 +
class ExperimentResults:
    """
    Collect the metric values of successive recommendation rounds and
    summarise them as averages.
    """
    def __init__(self,repo_size):
        """
        Remember the repository size and start with empty metric lists.
        """
        self.repository_size = repo_size
        self.precision, self.recall, self.fpr = [], [], []
        self.f05, self.mcc = [], []

    def add_result(self,ranking,sample):
        """
        Evaluate `ranking` (predicted items) against `sample` (real items)
        and append one value to each metric list.
        """
        predicted = RecommendationResult(dict.fromkeys(ranking,1))
        real = RecommendationResult(sample)
        measure = Evaluation(predicted,real,self.repository_size).run
        self.precision.append(measure(Precision()))
        self.recall.append(measure(Recall()))
        self.fpr.append(measure(FPR()))
        self.f05.append(measure(F_score(0.5)))
        self.mcc.append(measure(MCC()))

    def get_roc_point(self):
        """
        Return [mean fpr, mean recall], or [0,0] if either list is empty.
        """
        if self.recall and self.fpr:
            mean_fpr = sum(self.fpr)/len(self.fpr)
            mean_tpr = sum(self.recall)/len(self.recall)
            return [mean_fpr,mean_tpr]
        return [0,0]

    def get_precision_summary(self):
        """
        Return the mean precision, or 0 when nothing was recorded.
        """
        values = self.precision
        return sum(values)/len(values) if values else 0

    def get_f05_summary(self):
        """
        Return the mean F0.5, or 0 when nothing was recorded.
        """
        values = self.f05
        return sum(values)/len(values) if values else 0

    def get_mcc_summary(self):
        """
        Return the mean MCC, or 0 when nothing was recorded.
        """
        values = self.mcc
        return sum(values)/len(values) if values else 0
  95 +
if __name__ == '__main__':
    # Compare neighbourhood sizes: for every user of the sample and every k,
    # run `iterations` rounds of recommendation on a reduced profile, log the
    # averaged metrics per k and plot the results.

    # os is used for all path handling below but is not among the imports
    # at the top of this script.
    import os

    if len(sys.argv)<3:
        print("Usage: k-suite strategy_str sample_file")
        sys.exit(1)
    threshold = 20
    iterations = 30
    neighbors = [3,5,10,50,100,150,200,300,400,500]
    cfg = Config()
    cfg.strategy = sys.argv[1]
    sample_file = sys.argv[2]
    sample_name = sample_file.split('/')[-1]
    population_sample = []
    with open(sample_file,'r') as f:
        for line in f:
            user_id = line.strip('\n')
            if not user_id:
                # A blank line would produce a path that names no submission.
                continue
            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
    if not population_sample:
        # Without users every average below would divide by zero.
        print("No users found in sample file")
        sys.exit(1)
    # setup dictionaries and files
    roc_summary = {}
    recommended = {}
    precision_summary = {}
    f05_summary = {}
    mcc_summary = {}
    sample_dir = ("results/k-suite/%s" % sample_name)
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)
    log_file = os.path.join(sample_dir,cfg.strategy)
    with open(log_file,'w') as f:
        f.write("# %s\n\n" % sample_name)
        f.write("# strategy %s recommendation_size %d iterations %d\n\n" %
                (cfg.strategy,threshold,iterations))
        f.write("# k coverage \tprecision \tf05 \tmcc\n\n")

    for k in neighbors:
        roc_summary[k] = []
        recommended[k] = set()
        precision_summary[k] = []
        f05_summary[k] = []
        mcc_summary[k] = []
        with open(log_file+"-k%.3d"%k,'w') as f:
            f.write("# %s\n\n" % sample_name)
            f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))
            f.write("# roc_point \tprecision \tf05 \tmcc\n\n")

    # main loop per user
    for submission_file in population_sample:
        # NOTE(review): PopconSystem is not among the names this script
        # imports explicitly -- confirm where it comes from.
        user = PopconSystem(submission_file)
        user.filter_pkg_profile(cfg.pkgs_filter)
        user.maximal_pkg_profile()
        for k in neighbors:
            cfg.k_neighbors = k
            rec = Recommender(cfg)
            repo_size = rec.items_repository.get_doccount()
            results = ExperimentResults(repo_size)
            # n iterations for same recommender and user
            for n in range(iterations):
                # Fill sample profile: move a random 90% of the profile into
                # `sample`; the remainder is the profile used for the query.
                profile_len = len(user.pkg_profile)
                item_score = {}
                for pkg in user.pkg_profile:
                    item_score[pkg] = user.item_score[pkg]
                sample = {}
                sample_size = int(profile_len*0.9)
                for key in random.sample(list(item_score),sample_size):
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,threshold)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
                    recommended[k] = recommended[k].union(recommendation.ranking)
            # save summary
            roc_point = results.get_roc_point()
            roc_summary[k].append(roc_point)
            precision = results.get_precision_summary()
            precision_summary[k].append(precision)
            f05 = results.get_f05_summary()
            f05_summary[k].append(f05)
            mcc = results.get_mcc_summary()
            mcc_summary[k].append(mcc)
            with open(log_file+"-k%.3d"%k,'a') as f:
                f.write("[%.2f,%.2f] \t%.4f \t%.4f \t%.4f\n" %
                        (roc_point[0],roc_point[1],precision,f05,mcc))
    # back to main flow
    with open(log_file,'a') as f:
        plot_summary(precision_summary,f05_summary,mcc_summary,log_file)
        for k in neighbors:
            # Coverage is per neighbourhood size, so index by k (the
            # original indexed by an undefined name).
            coverage = len(recommended[k])/float(repo_size)
            f.write("%3d \t%.2f \t%.4f \t%.4f \t%.4f\n" %
                    (k,coverage,float(sum(precision_summary[k]))/len(precision_summary[k]),
                     float(sum(f05_summary[k]))/len(f05_summary[k]),
                     float(sum(mcc_summary[k]))/len(mcc_summary[k])))
            plot_roc(k,roc_summary[k],log_file)
... ...
src/experiments/deprecated/strategies-suite.py 0 → 100755
... ... @@ -0,0 +1,274 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + recommender suite - recommender experiments suite
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import sys
  23 +sys.path.insert(0,'../')
  24 +from config import Config
  25 +from data import PopconXapianIndex, PopconSubmission, AppAptXapianIndex
  26 +from recommender import Recommender
  27 +from user import LocalSystem, User
  28 +from evaluation import *
  29 +import logging
  30 +import random
  31 +import Gnuplot
  32 +
  33 +#iterations = 3
  34 +#sample_proportions = [0.9]
  35 +#weighting = [('bm25',1.2)]
  36 +#collaborative = ['knn']
  37 +#content_based = []
  38 +#hybrid = ['knnco']
  39 +#profile_size = [50,100]
  40 +#popcon_size = ["1000"]
  41 +#neighbors = [50]
  42 +
# Experiment grid used by the run_* functions below.

# Sampling rounds per configuration.
iterations = 10
# Fraction of the user profile moved into the sample in each round.
sample_proportions = [0.5, 0.6, 0.7, 0.8, 0.9]
# (weighting scheme, bm25 k1) pairs.
weighting = [
    ('bm25',1.2),
    ('bm25',1.6),
    ('bm25',2.0),
    ('trad',0),
]
# Strategy names, grouped by family.
content_based = [
    'cb','cbt','cbd','cbh',
    'cb_eset','cbt_eset','cbd_eset','cbh_eset',
]
collaborative = ['knn_eset','knn','knn_plus']
hybrid = ['knnco','knnco_eset']

# Values tried for cfg.profile_size and cfg.k_neighbors.
profile_size = range(20,100,20)
#popcon_size = [1000,10000,50000,'full']
neighbors = range(10,510,50)
  53 +
def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
    """
    Write the recall log of iteration `n` to the file "<log_file>-<n>".

    The log starts with two header lines built from label["description"]
    and label["values"], followed by a line holding `repo_size`,
    `profile_size` and the number of sampled packages. If `recommendation`
    has a `ranking` attribute, the log continues with the ranking positions
    of the sampled packages in ascending order and, under the heading
    "Out of recommendation:", the sampled packages missing from the ranking.
    """
    lines = ["# %s-n\n" % label["description"],
             "# %s-%d\n" % (label["values"],n),
             "\n%d %d %d\n" % (repo_size,profile_size,len(sample))]
    if hasattr(recommendation,"ranking"):
        # Record the first position of every ranked package once, instead
        # of scanning the ranking twice (membership test, then index())
        # for each sampled package.
        position = {}
        for rank,pkg in enumerate(recommendation.ranking):
            position.setdefault(pkg,rank)
        ranks = sorted(position[pkg] for pkg in sample if pkg in position)
        notfound = [pkg for pkg in sample if pkg not in position]
        lines.extend(str(r)+"\n" for r in ranks)
        if notfound:
            lines.append("Out of recommendation:\n")
            lines.extend(pkg+"\n" for pkg in notfound)
    # The with-statement closes the file even if writing fails.
    with open(("%s-%d" % (log_file,n)),'w') as output:
        output.writelines(lines)
  76 +
def plot_summary(precision,recall,f1,f05,accuracy,log_file):
    """
    Plot the five metric summaries against the recommendation size and
    save the graph twice -- with a linear and with a logarithmic x axis --
    as PNG and PostScript files named after `log_file`.
    """
    plotter = Gnuplot.Gnuplot()

    def save(suffix):
        # One PNG and one PostScript copy of the current plot.
        plotter.hardcopy(log_file+suffix+".png",terminal="png")
        plotter.hardcopy(log_file+suffix+".ps",terminal="postscript",enhanced=1,color=1)

    plotter('set style data lines')
    plotter.xlabel('Recommendation size')
    plotter.title("Setup: %s" % log_file.split("/")[-1])
    curves = [Gnuplot.Data(accuracy,title="Accuracy"),
              Gnuplot.Data(precision,title="Precision"),
              Gnuplot.Data(recall,title="Recall"),
              Gnuplot.Data(f1,title="F_1"),
              Gnuplot.Data(f05,title="F_0.5")]
    plotter.plot(*curves)
    save("")
    plotter('set logscale x')
    plotter('replot')
    save("-logscale")
  94 +
def get_label(cfg,sample_proportion):
    """
    Build the label of an experiment setup.

    Returns a dict with a "description" (the names of the fields) and the
    matching "values" string for the strategy family of cfg.strategy.
    For a strategy in none of the families, prints "Unknown strategy" and
    returns an empty dict.
    """
    strategy = cfg.strategy
    if strategy in content_based:
        description = "strategy-filter-profile-k1_bm25-sample"
        template = "%s-profile%d-%s-kbm%.1f-sample%.1f"
        head = (strategy,cfg.profile_size)
    elif strategy in collaborative:
        description = "strategy-knn-filter-k1_bm25-sample"
        template = "%s-k%d-%s-kbm%.1f-sample%.1f"
        head = (strategy,cfg.k_neighbors)
    elif strategy in hybrid:
        description = "strategy-knn-filter-profile-k1_bm25-sample"
        template = "%s-k%d-profile%d-%s-kbm%.1f-sample%.1f"
        head = (strategy,cfg.k_neighbors,cfg.profile_size)
    else:
        print("Unknown strategy")
        return {}
    # Every family ends with filter file name, bm25 k1 and sample proportion.
    tail = (cfg.pkgs_filter.split("/")[-1],cfg.bm25_k1,sample_proportion)
    return {"description": description, "values": template % (head+tail)}
  118 +
class ExperimentResults:
    """
    Collect metric values per recommendation size.

    Each metric attribute (accuracy, precision, recall, f1, f05) is a dict
    mapping a recommendation size to the list of values measured for it.
    """
    def __init__(self,repo_size):
        """
        Create empty value lists for sizes 1, 10..190 (step 10) and
        200..repo_size (step 100, repo_size excluded).
        """
        self.repository_size = repo_size
        self.accuracy = {}
        self.precision = {}
        self.recall = {}
        self.f1 = {}
        self.f05 = {}
        # list() keeps the concatenation valid where range() is not a list.
        points = ([1]+list(range(10,200,10))+
                  list(range(200,self.repository_size,100)))
        for size in points:
            self.accuracy[size] = []
            self.precision[size] = []
            self.recall[size] = []
            self.f1[size] = []
            self.f05[size] = []

    def add_result(self,ranking,sample):
        """
        Evaluate the first `size` items of `ranking` against `sample` for
        every tracked size and append the resulting metric values.
        """
        for size in self.accuracy.keys():
            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repository_size)
            self.accuracy[size].append(evaluation.run(Accuracy()))
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            self.f1[size].append(evaluation.run(F_score(1)))
            self.f05[size].append(evaluation.run(F_score(0.5)))

    @staticmethod
    def _summary(metric):
        # [[size, mean value], ...] ordered by size.
        return sorted([size,sum(values)/len(values)]
                      for size,values in metric.items())

    @staticmethod
    def _best(metric):
        # (size, value) of the highest single value recorded in `metric`.
        size = max(metric, key = lambda x: max(metric[x]))
        return (size,max(metric[size]))

    def get_precision_summary(self):
        """Return [[size, mean precision], ...] sorted by size."""
        return self._summary(self.precision)

    def get_recall_summary(self):
        """Return [[size, mean recall], ...] sorted by size."""
        return self._summary(self.recall)

    def get_f1_summary(self):
        """Return [[size, mean F1], ...] sorted by size."""
        return self._summary(self.f1)

    def get_f05_summary(self):
        """Return [[size, mean F0.5], ...] sorted by size."""
        return self._summary(self.f05)

    def get_accuracy_summary(self):
        """Return [[size, mean accuracy], ...] sorted by size."""
        return self._summary(self.accuracy)

    def best_precision(self):
        """Return (size, value) of the best precision recorded."""
        return self._best(self.precision)

    def best_f1(self):
        """Return (size, value) of the best F1 recorded."""
        return self._best(self.f1)

    def best_f05(self):
        """Return (size, value) of the best F0.5 recorded."""
        return self._best(self.f05)
  177 +
def run_strategy(cfg,user):
    """
    Run the strategy configured in `cfg` for `user` over every weighting
    scheme and sample proportion.

    For each combination, `iterations` rounds are run: a random part of the
    user's profile is moved into a sample, a recommendation is computed for
    the remaining profile, and the round is logged with write_recall_log.
    A summary file and the metric plots are then written under
    "results/strategies/".
    """
    # os is not among the imports at the top of this script.
    import os

    results_dir = "results/strategies"
    # Create the output directory up front; opening the log files fails
    # when it is missing.
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    for weight_name,k1 in weighting:
        cfg.weight = weight_name
        cfg.bm25_k1 = k1
        rec = Recommender(cfg)
        repo_size = rec.items_repository.get_doccount()
        for proportion in sample_proportions:
            results = ExperimentResults(repo_size)
            label = get_label(cfg,proportion)
            log_file = results_dir+"/"+label["values"]
            for n in range(iterations):
                # Fill sample profile
                profile_len = len(user.pkg_profile)
                item_score = {}
                for pkg in user.pkg_profile:
                    item_score[pkg] = user.item_score[pkg]
                sample = {}
                sample_size = int(profile_len*proportion)
                for key in random.sample(list(item_score),sample_size):
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,repo_size)
                write_recall_log(label,n,sample,recommendation,profile_len,repo_size,log_file)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
            precision_10 = sum(results.precision[10])/len(results.precision[10])
            f1_10 = sum(results.f1[10])/len(results.f1[10])
            f05_10 = sum(results.f05[10])/len(results.f05[10])
            # Each best_* call scans all recorded values; compute them once.
            best_precision = results.best_precision()
            best_f1 = results.best_f1()
            best_f05 = results.best_f05()
            with open(log_file,'w') as f:
                # NOTE(review): coverage is taken from the recommendation
                # of the last round only -- confirm this is intended.
                f.write("# %s\n# %s\n\ncoverage %d\n\n" %
                        (label["description"],label["values"],recommendation.size))
                f.write("# best results (recommendation size; metric)\n")
                f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\nf05 (%d; %.2f)\n\n" %
                        (best_precision[0],best_precision[1],
                         best_f1[0],best_f1[1],
                         best_f05[0],best_f05[1]))
                f.write("# recommendation size 10\nprecision (10; %.2f)\nf1 (10; %.2f)\nf05 (10; %.2f)" %
                        (precision_10,f1_10,f05_10))
            precision = results.get_precision_summary()
            recall = results.get_recall_summary()
            f1 = results.get_f1_summary()
            f05 = results.get_f05_summary()
            accuracy = results.get_accuracy_summary()
            plot_summary(precision,recall,f1,f05,accuracy,log_file)
  223 +
def run_content(user,cfg):
    """
    Run every content-based strategy with each configured profile size.
    """
    for strategy_name in content_based:
        cfg.strategy = strategy_name
        for n_items in profile_size:
            cfg.profile_size = n_items
            run_strategy(cfg,user)
  230 +
def run_collaborative(user,cfg):
    """
    Run every collaborative strategy with each configured neighbourhood size.
    """
    # The former per-popcon-size variation (popcon_size) is disabled, so the
    # popcon index paths no longer need to be read from cfg here.
    for strategy in collaborative:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            run_strategy(cfg,user)
  243 +
def run_hybrid(user,cfg):
    """
    Run every hybrid strategy with each combination of configured
    neighbourhood size and profile size.
    """
    # The former per-popcon-size variation (popcon_size) is disabled, so the
    # popcon index paths no longer need to be read from cfg here.
    for strategy in hybrid:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            for size in profile_size:
                cfg.profile_size = size
                run_strategy(cfg,user)
  258 +
if __name__ == '__main__':
    # Alternative users kept for reference:
    #user = LocalSystem()
    #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
    #user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")

    cfg = Config()
    user = PopconSystem("/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7")
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    # With no command-line argument every suite runs; otherwise only the
    # suites named on the command line, always in this order.
    run_all = len(sys.argv)<2
    suites = (("content",run_content),
              ("collaborative",run_collaborative),
              ("hybrid",run_hybrid))
    for keyword,suite in suites:
        if run_all or keyword in sys.argv:
            suite(user,cfg)
... ...
src/experiments/experiments.cfg
... ... @@ -1,27 +0,0 @@
1   -[DEFAULT]
2   -repetitions = 1
3   -iterations = 10
4   -path = 'results'
5   -experiment = 'grid'
6   -weight = ['bm25', 'trad']
7   -;profile_size = range(10,100,10)
8   -;sample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
9   -sample = [0.6, 0.7, 0.8, 0.9]
10   -
11   -[content]
12   -strategy = ['cb','cbt','cbd']
13   -
14   -[clustering]
15   -experiment = 'single'
16   -;iterations = 4
17   -;medoids = range(2,6)
18   -iterations = 6
19   -medoids = [100,500,1000,5000,10000,50000]
20   -;disabled for this experiment
21   -weight = 0
22   -profile_size = 0
23   -sample = 0
24   -
25   -[colaborative]
26   -users_repository=["data/popcon","data/popcon-100","data/popcon-500","data/popcon-1000","data/popcon-5000","data/popcon-10000","data/popcon-50000"]
27   -neighbors = range(10,1010,50)
src/experiments/extract-sample-db.py 0 → 100755
... ... @@ -0,0 +1,49 @@
  1 +#! /usr/bin/env python
  2 +"""
  3 + extract-sample-db - move a sample of popcon submissions into a new index
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import xapian
  23 +import os
  24 +import random
  25 +import sys
  26 +
if __name__ == '__main__':
    # Move the submissions listed in sample_file (one id per line) from the
    # popcon index into a new index "<popcon_index>-<sample file name>".
    # Check the arguments explicitly, so that a real failure (for example
    # an index that cannot be opened) is reported as itself instead of
    # being turned into a usage message.
    if len(sys.argv) < 3:
        print("Usage: extract-sample-db sample_file popcon_index")
        sys.exit(1)
    sample_file = sys.argv[1]
    popcon_path = sys.argv[2]

    # The source index is opened writable: extracted documents are deleted
    # from it below.
    popcon = xapian.WritableDatabase(popcon_path,xapian.DB_OPEN)
    enquire = xapian.Enquire(popcon)
    print(sample_file.split("/"))
    new_popcon = xapian.WritableDatabase(popcon_path+"-"+sample_file.split("/")[-1],xapian.DB_CREATE_OR_OVERWRITE)
    print("Popcon repository size: %d" % popcon.get_doccount())
    # The with-statement closes the sample file when the loop ends.
    with open(sample_file) as submissions:
        for submission in submissions:
            submission_id = submission.strip()
            if not submission_id:
                # Blank line: no submission to look up.
                continue
            term = "ID"+submission_id
            print(term)
            enquire.set_query(xapian.Query(term))
            mset = enquire.get_mset(0,20)
            for m in mset:
                print("Adding doc %s" % m.docid)
                new_popcon.add_document(popcon.get_document(m.docid))
                print("Removing doc %s" % m.docid)
                popcon.delete_document(m.docid)
    print("Popcon repository size: %d" % popcon.get_doccount())
    print("Popcon repository size: %d" % new_popcon.get_doccount())
... ...
src/experiments/hybrid.py 0 → 100755
... ... @@ -0,0 +1,202 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + hybrid-suite
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import sys
  23 +sys.path.insert(0,'../')
  24 +from config import Config
  25 +from data import PopconXapianIndex, PopconSubmission
  26 +from recommender import Recommender
  27 +from user import LocalSystem, User
  28 +from evaluation import *
  29 +import logging
  30 +import random
  31 +import Gnuplot
  32 +import numpy
  33 +
  34 +#hybrid_strategies = ['knnco','knnco_eset']
  35 +
def png_to_jpg(path):
    """Return ``path`` with a trailing ".png" extension replaced by ".jpg".

    ``str.strip(".png")`` is not suitable for this: it removes any of the
    characters '.', 'p', 'n' and 'g' from BOTH ends of the string instead of
    removing the literal suffix.
    """
    suffix = ".png"
    if path.endswith(suffix):
        path = path[:-len(suffix)]
    return path + ".jpg"


def holdout_split(item_score, holdout_size):
    """Randomly split ``item_score`` into ``(kept, held_out)`` dictionaries.

    ``holdout_size`` distinct keys are drawn at random and moved, together
    with their scores, into ``held_out``; the remaining entries form
    ``kept``. The input dictionary is left untouched. When ``holdout_size``
    exceeds the number of entries, every entry is held out.
    """
    kept = dict(item_score)
    held_out = {}
    # One random.sample call replaces rebuilding the key list on every draw.
    for key in random.sample(list(kept), min(holdout_size, len(kept))):
        held_out[key] = kept.pop(key)
    return kept, held_out


def plot_summary(title, metric_title, summary, coverage, out_file):
    """Plot a metric's mean/deviation and the coverage, then save a PNG.

    ``summary`` maps profile size -> list of per-user metric values and
    ``coverage`` maps profile size -> coverage ratio.
    """
    metric_points = sorted([[i, numpy.mean(summary[i]), numpy.std(summary[i])]
                            for i in summary.keys()])
    coverage_points = sorted([[i, coverage[i]] for i in coverage.keys()])
    g = Gnuplot.Gnuplot()
    g('set style data lines')
    g('set yrange [0:1.0]')
    g.xlabel('Profile size')
    g.title(title)
    g.plot(Gnuplot.Data(metric_points, title=metric_title),
           Gnuplot.Data(metric_points, title="Deviation",
                        with_="yerrorbar lt 2 pt 6"),
           Gnuplot.Data(coverage_points, title="Coverage"))
    g.hardcopy(out_file, terminal="png")


if __name__ == '__main__':
    # This file's import block does not import os by name, so bind it here.
    # NOTE(review): PopconSystem is not imported by name either; presumably
    # it arrives through "from evaluation import *" -- confirm.
    import os

    # Both the strategy (argv[1]) and the sample file (argv[2]) are required.
    if len(sys.argv) < 3:
        print("Usage: hybrid strategy sample_file")
        sys.exit(1)

    iterations = 20
    profile_size = [10, 40, 70, 100, 170, 240]
    neighbor_size = [3, 10, 50, 70, 100, 150, 200]

    # Reduced setup for quick runs:
    #iterations = 1
    #profile_size = [10,20,30]
    #neighbor_size = [10,20,30]

    cfg = Config()
    strategy = sys.argv[1]
    sample_file = sys.argv[2]
    sample_str = sample_file.split('/')[-1]

    # Each line of the sample file is a submission id; the submission lives
    # in a sub-directory named after the first two characters of the id.
    population_sample = []
    with open(sample_file, 'r') as f:
        for line in f:
            user_id = line.strip('\n')
            population_sample.append(
                os.path.join(cfg.popcon_dir, user_id[:2], user_id))

    sample_dir = ("results/hybrid/%s/%s" % (sample_str, strategy))
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)

    cfg.strategy = strategy
    log_file = os.path.join(sample_dir, sample_str + "-" + cfg.strategy)

    # Accumulators indexed as [neighborhood size][profile size].
    p_10_summary = {}       # per-user mean precision of the top 10
    f05_100_summary = {}    # per-user mean F(0.5) of the top 100
    c_10 = {}               # every item that appeared in some top 10
    c_100 = {}              # every item that appeared in some top 100

    # Output file names indexed by neighborhood size.
    graph_10 = {}
    graph_100 = {}
    graph_10_jpg = {}
    graph_100_jpg = {}
    comment_10 = {}
    comment_100 = {}

    # Create the output files and write their headers.
    for k in neighbor_size:
        graph_10[k] = log_file + ("-neighborhood%.3d-010.png" % k)
        graph_100[k] = log_file + ("-neighborhood%.3d-100.png" % k)
        graph_10_jpg[k] = png_to_jpg(graph_10[k])
        graph_100_jpg[k] = png_to_jpg(graph_100[k])
        comment_10[k] = graph_10_jpg[k] + ".comment"
        comment_100[k] = graph_100_jpg[k] + ".comment"

        with open(comment_10[k], 'w') as f:
            f.write("# %s\n" % sample_str)
            f.write("# strategy %s\n# threshold 10\n# iterations %d\n\n" %
                    (cfg.strategy, iterations))
            f.write("# neighborhood\tprofile\tmean_p_10\tdev_p_10\tc_10\n\n")
        with open(comment_100[k], 'w') as f:
            f.write("# %s\n" % sample_str)
            f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
                    (cfg.strategy, iterations))
            f.write("# neighborhood\tprofile\tmean_f05_100\tdev_f05_100\tc_100\n\n")

        c_10[k] = {}
        c_100[k] = {}
        p_10_summary[k] = {}
        f05_100_summary[k] = {}
        for size in profile_size:
            c_10[k][size] = set()
            c_100[k][size] = set()
            p_10_summary[k][size] = []
            f05_100_summary[k][size] = []
            with open(log_file + "-neighborhood%.3d-profile%.3d" % (k, size), 'w') as f:
                f.write("# %s\n" % sample_str)
                f.write("# strategy %s-neighborhood%.3d-profile%.3d\n\n" %
                        (cfg.strategy, k, size))
                f.write("# p_10\t\tf05_100\n\n")

    # main loop per user
    for submission_file in population_sample:
        user = PopconSystem(submission_file)
        user.filter_pkg_profile(cfg.pkgs_filter)
        user.maximal_pkg_profile()
        for k in neighbor_size:
            cfg.k_neighbors = k
            for size in profile_size:
                cfg.profile_size = size
                rec = Recommender(cfg)
                repo_size = rec.items_repository.get_doccount()
                p_10 = []
                f05_100 = []
                for n in range(iterations):
                    # Hold out 90% of the profile as the expected answer and
                    # feed the remainder to the recommender.
                    profile_len = len(user.pkg_profile)
                    full_profile = {}
                    for pkg in user.pkg_profile:
                        full_profile[pkg] = user.item_score[pkg]
                    item_score, sample = holdout_split(
                        full_profile, int(profile_len * 0.9))
                    iteration_user = User(item_score)
                    recommendation = rec.get_recommendation(iteration_user,
                                                            repo_size)
                    if hasattr(recommendation, "ranking"):
                        ranking = recommendation.ranking
                        real = RecommendationResult(sample)
                        predicted_10 = RecommendationResult(
                            dict.fromkeys(ranking[:10], 1))
                        evaluation = Evaluation(predicted_10, real, repo_size)
                        p_10.append(evaluation.run(Precision()))
                        predicted_100 = RecommendationResult(
                            dict.fromkeys(ranking[:100], 1))
                        evaluation = Evaluation(predicted_100, real, repo_size)
                        f05_100.append(evaluation.run(F_score(0.5)))
                        c_10[k][size].update(ranking[:10])
                        c_100[k][size].update(ranking[:100])
                # save summary
                if p_10:
                    p_10_summary[k][size].append(numpy.mean(p_10))
                if f05_100:
                    f05_100_summary[k][size].append(numpy.mean(f05_100))

                # The per-user log line is written even when no iteration
                # produced a ranking (numpy.mean of an empty list is nan).
                with open(log_file + "-neighborhood%.3d-profile%.3d" % (k, size), 'a') as f:
                    f.write("%.4f\t\t%.4f\n" %
                            (numpy.mean(p_10), numpy.mean(f05_100)))

    # back to main flow
    # repo_size below is the value left by the last Recommender built above.
    coverage_10 = {}
    coverage_100 = {}
    for k in neighbor_size:
        coverage_10[k] = {}
        coverage_100[k] = {}
        with open(comment_10[k], 'a') as f:
            for size in profile_size:
                coverage_10[k][size] = len(c_10[k][size]) / float(repo_size)
                f.write("%3d\t\t%3d\t\t%.4f\t%.4f\t%.4f\n" %
                        (k, size, numpy.mean(p_10_summary[k][size]),
                         numpy.std(p_10_summary[k][size]),
                         coverage_10[k][size]))
        with open(comment_100[k], 'a') as f:
            for size in profile_size:
                coverage_100[k][size] = len(c_100[k][size]) / float(repo_size)
                f.write("%3d\t\t%3d\t\t%.4f\t%.4f\t%.4f\n" %
                        (k, size, numpy.mean(f05_100_summary[k][size]),
                         numpy.std(f05_100_summary[k][size]),
                         coverage_100[k][size]))

    # plot results summary, one pair of graphs per neighborhood size
    for k in neighbor_size:
        plot_summary("Setup: %s-neighborhood%3d (threshold 10)" % (cfg.strategy, k),
                     "Precision", p_10_summary[k], coverage_10[k], graph_10[k])
        plot_summary("Setup: %s-neighborhood%3d (threshold 100)" % (cfg.strategy, k),
                     "F05", f05_100_summary[k], coverage_100[k], graph_100[k])
... ...
src/experiments/k-suite.py
... ... @@ -1,152 +0,0 @@
1   -#!/usr/bin/env python
2   -"""
3   - recommender suite - recommender experiments suite
4   -"""
5   -__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
6   -__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
7   -__license__ = """
8   - This program is free software: you can redistribute it and/or modify
9   - it under the terms of the GNU General Public License as published by
10   - the Free Software Foundation, either version 3 of the License, or
11   - (at your option) any later version.
12   -
13   - This program is distributed in the hope that it will be useful,
14   - but WITHOUT ANY WARRANTY; without even the implied warranty of
15   - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16   - GNU General Public License for more details.
17   -
18   - You should have received a copy of the GNU General Public License
19   - along with this program. If not, see <http://www.gnu.org/licenses/>.
20   -"""
21   -
22   -import sys
23   -sys.path.insert(0,'../')
24   -from config import Config
25   -from data import PopconXapianIndex, PopconSubmission
26   -from recommender import Recommender
27   -from user import LocalSystem, User
28   -from evaluation import *
29   -import logging
30   -import random
31   -import Gnuplot
32   -import numpy
33   -
def plot_roc(p,roc_points,log_file):
    """Plot ROC points for neighborhood size ``p`` and save them to disk.

    p          -- neighborhood size; used in the curve title and file names.
    roc_points -- list of [false positive rate, true positive rate] pairs.
    log_file   -- path prefix; the plot is written to
                  "<log_file>-k<p>.png" and "<log_file>-k<p>.ps".

    The diagonal from (0,0) to (1,1) is drawn as a reference line.
    """
    g = Gnuplot.Gnuplot()
    g('set style data points')
    g.xlabel('False Positive Rate')
    g.ylabel('True Positive Rate')
    g('set xrange [0:1.0]')
    g('set yrange [0:1.0]')
    g.title("Setup: %s" % log_file.split("/")[-1])
    # Use the parameter p rather than whatever global "k" happens to exist.
    g.plot(Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"),
           Gnuplot.Data(roc_points,title="k %d"%p))
    g.hardcopy(log_file+("-k%.3d.png"%p),terminal="png")
    g.hardcopy(log_file+("-k%.3d.ps"%p),terminal="postscript",enhanced=1,color=1)
46   -
def _average(values):
    """Return sum(values)/len(values); raises ZeroDivisionError when empty."""
    return sum(values)/len(values)


class ExperimentResults:
    """Collects precision, recall and false-positive-rate values per run.

    Each call to add_result() evaluates one ranking against the expected
    sample and appends one value to each of the three lists.
    """

    def __init__(self,repo_size):
        # repo_size is handed to every Evaluation built in add_result().
        self.repository_size = repo_size
        self.precision, self.recall, self.fpr = [], [], []

    def add_result(self,ranking,sample):
        """Evaluate ``ranking`` against ``sample`` and record the metrics."""
        guessed = RecommendationResult(dict.fromkeys(ranking,1))
        expected = RecommendationResult(sample)
        evaluation = Evaluation(guessed,expected,self.repository_size)
        for scores,metric_class in ((self.precision,Precision),
                                    (self.recall,Recall),
                                    (self.fpr,FPR)):
            scores.append(evaluation.run(metric_class()))

    def get_roc_point(self):
        """Return [mean FPR, mean TPR] over the recorded results."""
        # The recorded recall values are used as the true positive rate.
        return [_average(self.fpr),_average(self.recall)]

    def get_precision_summary(self):
        """Return the mean of the recorded precision values."""
        return _average(self.precision)

    def get_recall_summary(self):
        """Return the mean of the recorded recall values."""
        return _average(self.recall)
73   -
if __name__ == '__main__':
    # This file's import block does not import os by name, so bind it here.
    # NOTE(review): PopconSystem is not imported by name either; presumably
    # it arrives through "from evaluation import *" -- confirm.
    import os

    # experiment parameters
    threshold = 20
    iterations = 30
    sample_file = "results/misc-popcon/sample-050-100"
    neighbors = [3,5,10,50,100,150,200,300,400,500]
    cfg = Config()
    cfg.strategy = "knn"
    print(cfg.popcon_index)

    # Each line of the sample file is a submission id; the submission lives
    # in a sub-directory named after the first two characters of the id.
    population = []
    with open(sample_file,'r') as f:
        for line in f:
            user_id = line.strip('\n')
            population.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))

    # setup dictionaries and files (all indexed by neighborhood size k)
    roc_points = {}
    recommended = {}
    precisions = {}
    aucs = {}
    log_file = "results/k-suite/sample-050-100/%s" % (cfg.strategy)
    for k in neighbors:
        roc_points[k] = []
        recommended[k] = set()
        precisions[k] = []
        aucs[k] = []
        with open(log_file+"-k%.3d"%k,'w') as f:
            f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))
            f.write("# roc_point \tp(20) \tauc\n\n")

    # main loop per user
    for submission_file in population:
        user = PopconSystem(submission_file)
        user.filter_pkg_profile(cfg.pkgs_filter)
        user.maximal_pkg_profile()
        for k in neighbors:
            cfg.k_neighbors = k
            rec = Recommender(cfg)
            repo_size = rec.items_repository.get_doccount()
            results = ExperimentResults(repo_size)
            # n iterations for same recommender and user
            for n in range(iterations):
                # Hold out 90% of the profile as the expected answer and
                # feed the remainder to the recommender.
                profile_size = len(user.pkg_profile)
                item_score = {}
                for pkg in user.pkg_profile:
                    item_score[pkg] = user.item_score[pkg]
                # Named "held_out" so it no longer rebinds the population
                # list that the outer loop is iterating over.
                held_out = {}
                sample_size = int(profile_size*0.9)
                for i in range(sample_size):
                    key = random.choice(list(item_score))
                    held_out[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,threshold)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,held_out)
                    print("ranking %s" % (recommendation.ranking,))
                    print("recommended_%d %s" % (k,recommended[k]))
                    recommended[k] = recommended[k].union(recommendation.ranking)
                    print(recommended[k])
            # save summary
            roc_point = results.get_roc_point()
            # Area under the three-point curve (0,0) -> roc_point -> (1,1).
            auc = numpy.trapz(y=[0,roc_point[1],1],x=[0,roc_point[0],1])
            p_20 = results.get_precision_summary()
            roc_points[k].append(roc_point)
            aucs[k].append(auc)
            precisions[k].append(p_20)
            with open(log_file+"-k%.3d"%k,'a') as f:
                f.write("%s \t%.2f \t%.4f\n" % (str(roc_point),p_20,auc))

    # back to main flow
    # repo_size below is the value left by the last Recommender built above.
    with open(log_file,'w') as f:
        f.write("# k coverage \tp(20) \tauc\n\n")
        for k in neighbors:
            print("len_recommended_%d %s" % (k,len(recommended[k])))
            print("repo_size %s" % (repo_size,))
            coverage = len(recommended[k])/float(repo_size)
            print(coverage)
            # The original format ended in "%.2fi", appending a stray "i"
            # to the auc column.
            f.write("%d \t%.2f \t%.2f \t%.2f\n" %
                    (k,coverage,float(sum(precisions[k]))/len(precisions[k]),
                     float(sum(aucs[k]))/len(aucs[k])))
            plot_roc(k,roc_points[k],log_file)
src/experiments/legacy/clustering-suite.py
... ... @@ -1,51 +0,0 @@
1   -#!/usr/bin/env python
2   -"""
3   - recommender suite - recommender experiments suite
4   -"""
5   -__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
6   -__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
7   -__license__ = """
8   - This program is free software: you can redistribute it and/or modify
9   - it under the terms of the GNU General Public License as published by
10   - the Free Software Foundation, either version 3 of the License, or
11   - (at your option) any later version.
12   -
13   - This program is distributed in the hope that it will be useful,
14   - but WITHOUT ANY WARRANTY; without even the implied warranty of
15   - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16   - GNU General Public License for more details.
17   -
18   - You should have received a copy of the GNU General Public License
19   - along with this program. If not, see <http://www.gnu.org/licenses/>.
20   -"""
21   -
22   -import sys
23   -import os
24   -sys.path.insert(0,'../')
25   -from config import Config
26   -from data import PopconXapianIndex, PopconSubmission
27   -from recommender import Recommender
28   -from user import LocalSystem, User
29   -from evaluation import *
30   -import logging
31   -import random
32   -import Gnuplot
33   -
if __name__ == '__main__':
    # Re-cluster the popcon index and log the resulting dispersion.
    cfg = Config()
    cfg.index_mode = "recluster"
    logging.info("Starting clustering experiments")
    logging.info("Medoids: %d\t Max popcon:%d" % (cfg.k_medoids,cfg.max_popcon))
    cfg.popcon_dir = os.path.expanduser("~/org/popcon.debian.org/popcon-mail/popcon-entries/")
    # Suffix the index and cluster directories with the experiment setup so
    # that runs with different parameters do not share them.
    cfg.popcon_index = cfg.popcon_index+("_%dmedoids%dmax" %
                                         (cfg.k_medoids,cfg.max_popcon))
    cfg.clusters_dir = cfg.clusters_dir+("_%dmedoids%dmax" %
                                         (cfg.k_medoids,cfg.max_popcon))
    pxi = PopconXapianIndex(cfg)
    logging.info("Overall dispersion: %f\n" % pxi.cluster_dispersion)
    # Write clustering log
    log_path = "results/clustering/%dmedoids%dmax" % (cfg.k_medoids,cfg.max_popcon)
    with open(log_path,'w') as output:
        output.write("# k_medoids\tmax_popcon\tdispersion\n")
        # Three values need three conversion specifiers; the original
        # "%d %f\n" raised TypeError when formatted with three arguments.
        output.write("%d %d %f\n" % (cfg.k_medoids,cfg.max_popcon,pxi.cluster_dispersion))
src/experiments/legacy/experiments.cfg
... ... @@ -1,27 +0,0 @@
1   -[DEFAULT]
2   -repetitions = 1
3   -iterations = 10
4   -path = 'results'
5   -experiment = 'grid'
6   -weight = ['bm25', 'trad']
7   -;profile_size = range(10,100,10)
8   -;sample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
9   -sample = [0.6, 0.7, 0.8, 0.9]
10   -
11   -[content]
12   -strategy = ['cb','cbt','cbd']
13   -
14   -[clustering]
15   -experiment = 'single'
16   -;iterations = 4
17   -;medoids = range(2,6)
18   -iterations = 6
19   -medoids = [100,500,1000,5000,10000,50000]
20   -;disabled for this experiment
21   -weight = 0
22   -profile_size = 0
23   -sample = 0
24   -
25   -[colaborative]
26   -users_repository=["data/popcon","data/popcon-100","data/popcon-500","data/popcon-1000","data/popcon-5000","data/popcon-10000","data/popcon-50000"]
27   -neighbors = range(10,1010,50)
src/experiments/legacy/runner.py
... ... @@ -1,171 +0,0 @@
1   -#!/usr/bin/env python
2   -"""
3   - recommender suite - recommender experiments suite
4   -"""
5   -__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
6   -__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
7   -__license__ = """
8   - This program is free software: you can redistribute it and/or modify
9   - it under the terms of the GNU General Public License as published by
10   - the Free Software Foundation, either version 3 of the License, or
11   - (at your option) any later version.
12   -
13   - This program is distributed in the hope that it will be useful,
14   - but WITHOUT ANY WARRANTY; without even the implied warranty of
15   - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16   - GNU General Public License for more details.
17   -
18   - You should have received a copy of the GNU General Public License
19   - along with this program. If not, see <http://www.gnu.org/licenses/>.
20   -"""
21   -
22   -import expsuite
23   -import sys
24   -sys.path.insert(0,'../')
25   -from config import Config
26   -from data import PopconXapianIndex, PopconSubmission
27   -from recommender import Recommender
28   -from user import LocalSystem, User
29   -from evaluation import *
30   -import logging
31   -import random
32   -import Gnuplot
33   -
class ClusteringSuite(expsuite.PyExperimentSuite):
    """Experiment suite that re-clusters the sample popcon index."""

    def reset(self, params, rep):
        """Build a Config pointing at the test data for this repetition."""
        config = Config()
        config.popcon_index = "../tests/test_data/.sample_pxi"
        config.popcon_dir = "../tests/test_data/popcon_dir"
        config.clusters_dir = "../tests/test_data/clusters_dir"
        self.cfg = config

        # Only the experiment named "clustering" switches to recluster mode.
        if params['name'] == "clustering":
            logging.info("Starting 'clustering' experiments suite...")
            self.cfg.index_mode = "recluster"

    def iterate(self, params, rep, n):
        """Run iteration ``n``: index with params['medoids'][n] medoids.

        Returns a dict with the medoid count and the cluster dispersion, or
        an empty dict for any experiment other than "clustering".
        """
        if params['name'] != "clustering":
            return {}
        medoids = params['medoids'][n]
        logging.info("Running iteration %d" % medoids)
        self.cfg.k_medoids = medoids
        index = PopconXapianIndex(self.cfg)
        return {'k_medoids': params['medoids'][n],
                'dispersion': index.cluster_dispersion}
55   -
class ContentBasedSuite(expsuite.PyExperimentSuite):
    """Experiment suite for the content-based recommendation strategies.

    Only experiments whose name starts with "content" are handled.
    """

    def reset(self, params, rep):
        """Set up recommender, user profile and sample size for a repetition."""
        if not params['name'].startswith("content"):
            return
        cfg = Config()
        #if the index was not built yet
        #app_axi = AppAptXapianIndex(cfg.axi,"results/arnaldo/AppAxi")
        cfg.axi = "data/AppAxi"
        cfg.index_mode = "old"
        cfg.weight = params['weight']
        self.rec = Recommender(cfg)
        self.rec.set_strategy(params['strategy'])
        self.repo_size = self.rec.items_repository.get_doccount()
        self.user = LocalSystem()
        self.user.app_pkg_profile(self.rec.items_repository)
        self.user.no_auto_pkg_profile()
        # Number of profile items held out in every iteration.
        self.sample_size = int(len(self.user.pkg_profile)*params['sample'])
        # iteration should be set to 10 in config file
        #self.profile_size = range(10,101,10)

    def iterate(self, params, rep, n):
        """Run iteration ``n``: hold out a sample, recommend, log and plot.

        Returns a dict with the iteration number, weight, strategy and the
        [size, value] metric entries at index 20, or an empty dict for
        experiments this suite does not handle (matching ClusteringSuite).
        """
        if not params['name'].startswith("content"):
            return {}

        item_score = dict.fromkeys(self.user.pkg_profile,1)
        # Prepare partition: move sample_size random items out of the profile
        sample = {}
        for i in range(self.sample_size):
            key = random.choice(list(item_score))
            sample[key] = item_score.pop(key)
        # Get full recommendation
        user = User(item_score)
        recommendation = self.rec.get_recommendation(user,self.repo_size)

        # Position of the first occurrence of every recommended package,
        # built once instead of scanning the ranking for each sample item.
        rank_of = {}
        for position, pkg in enumerate(recommendation.ranking):
            rank_of.setdefault(pkg,position)
        ranks = []
        notfound = []
        for pkg in sample.keys():
            if pkg in rank_of:
                ranks.append(rank_of[pkg])
            else:
                notfound.append(pkg)

        # Write recall log; "with" closes the file even if a write fails.
        recall_file = "results/content/recall/%s-%s-%.2f-%d" % \
                      (params['strategy'],params['weight'],params['sample'],n)
        with open(recall_file,'w') as output:
            output.write("# weight=%s\n" % params['weight'])
            output.write("# strategy=%s\n" % params['strategy'])
            output.write("# sample=%f\n" % params['sample'])
            output.write("\n%d %d %d\n" % \
                         (self.repo_size,len(item_score),self.sample_size))
            for r in sorted(ranks):
                output.write(str(r)+"\n")
            if notfound:
                output.write("Out of recommendation:\n")
                for pkg in notfound:
                    output.write(pkg+"\n")

        # Plot metrics summary for recommendation sizes 1, 101, 201, ...
        accuracy = []
        precision = []
        recall = []
        f1 = []
        g = Gnuplot.Gnuplot()
        g('set style data lines')
        g.xlabel('Recommendation size')
        real = RecommendationResult(sample)
        for size in range(1,len(recommendation.ranking)+1,100):
            predicted = RecommendationResult(dict.fromkeys(recommendation.ranking[:size],1))
            evaluation = Evaluation(predicted,real,self.repo_size)
            accuracy.append([size,evaluation.run(Accuracy())])
            precision.append([size,evaluation.run(Precision())])
            recall.append([size,evaluation.run(Recall())])
            f1.append([size,evaluation.run(F1())])
        g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
               Gnuplot.Data(precision,title="Precision"),
               Gnuplot.Data(recall,title="Recall"),
               Gnuplot.Data(f1,title="F1"))
        g.hardcopy(recall_file+"-plot.ps", enhanced=1, color=1)

        # Iteration log
        # NOTE(review): index 20 is the entry for size 2001 and raises
        # IndexError when the ranking has fewer than 2001 items -- confirm
        # that this is the intended cut-off.
        return {'iteration': n,
                'weight': params['weight'],
                'strategy': params['strategy'],
                'accuracy': accuracy[20],
                'precision': precision[20],
                # The key used to be 'recall:' (stray colon).
                'recall': recall[20],
                'f1': f1[20]}
139   -
140   -#class CollaborativeSuite(expsuite.PyExperimentSuite):
141   -# def reset(self, params, rep):
142   -# if params['name'].startswith("collaborative"):
143   -#
144   -# def iterate(self, params, rep, n):
145   -# if params['name'].startswith("collaborative"):
146   -# for root, dirs, files in os.walk(self.source_dir):
147   -# for popcon_file in files:
148   -# submission = PopconSubmission(os.path.join(root,popcon_file))
149   -# user = User(submission.packages)
150   -# user.maximal_pkg_profile()
151   -# rec.get_recommendation(user)
152   -# precision = 0
153   -# result = {'weight': params['weight'],
154   -# 'strategy': params['strategy'],
155   -# 'profile_size': self.profile_size[n],
156   -# 'accuracy': accuracy,
157   -# 'precision': precision,
158   -# 'recall:': recall,
159   -# 'f1': }
160   -# else:
161   -# result = {}
162   -# return result
163   -
if __name__ == '__main__':
    # A suite runs when it is named on the command line, and every suite
    # runs when fewer than two arguments are given.
    run_everything = len(sys.argv)<3
    arguments = sys.argv

    if run_everything or "clustering" in arguments:
        ClusteringSuite().start()
    if run_everything or "content" in arguments:
        ContentBasedSuite().start()
    # The collaborative suite is disabled:
    #if "collaborative" in sys.argv or len(sys.argv)<3:
        #CollaborativeSuite().start()
src/experiments/pure.py 0 → 100755
... ... @@ -0,0 +1,199 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + profile-suite - experiment different profile sizes
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import sys
  23 +sys.path.insert(0,'../')
  24 +from config import Config
  25 +from data import PopconXapianIndex, PopconSubmission
  26 +from recommender import Recommender
  27 +from user import LocalSystem, User
  28 +from evaluation import *
  29 +import logging
  30 +import random
  31 +import Gnuplot
  32 +import numpy
  33 +
def png_to_jpg(path):
    """Return ``path`` with a trailing ".png" extension replaced by ".jpg".

    ``str.strip(".png")`` is not suitable for this: it removes any of the
    characters '.', 'p', 'n' and 'g' from BOTH ends of the string instead of
    removing the literal suffix.
    """
    suffix = ".png"
    if path.endswith(suffix):
        path = path[:-len(suffix)]
    return path + ".jpg"


def holdout_split(item_score, holdout_size):
    """Randomly split ``item_score`` into ``(kept, held_out)`` dictionaries.

    ``holdout_size`` distinct keys are drawn at random and moved, together
    with their scores, into ``held_out``; the remaining entries form
    ``kept``. The input dictionary is left untouched. When ``holdout_size``
    exceeds the number of entries, every entry is held out.
    """
    kept = dict(item_score)
    held_out = {}
    # One random.sample call replaces rebuilding the key list on every draw.
    for key in random.sample(list(kept), min(holdout_size, len(kept))):
        held_out[key] = kept.pop(key)
    return kept, held_out


def plot_summary(title, x_label, metric_title, summary, coverage, out_file):
    """Plot a metric's mean/deviation and the coverage, then save a PNG.

    ``summary`` maps size -> list of per-user metric values and ``coverage``
    maps size -> coverage ratio.
    """
    metric_points = sorted([[k, numpy.mean(summary[k]), numpy.std(summary[k])]
                            for k in summary.keys()])
    coverage_points = sorted([[k, coverage[k]] for k in coverage.keys()])
    g = Gnuplot.Gnuplot()
    g('set style data lines')
    g('set yrange [0:1.0]')
    g.xlabel(x_label)
    g.title(title)
    g.plot(Gnuplot.Data(metric_points, title=metric_title),
           Gnuplot.Data(metric_points, title="Deviation",
                        with_="yerrorbar lt 2 pt 6"),
           Gnuplot.Data(coverage_points, title="Coverage"))
    g.hardcopy(out_file, terminal="png")


if __name__ == '__main__':
    # This file's import block does not import os by name, so bind it here.
    # NOTE(review): PopconSystem is not imported by name either; presumably
    # it arrives through "from evaluation import *" -- confirm.
    import os

    # Both the category (argv[1]) and the sample file (argv[2]) are required.
    if len(sys.argv) < 3:
        print("Usage: pure strategy_category sample_file")
        sys.exit(1)

    iterations = 20
    profile_size = [10, 20, 40, 60, 80, 100, 140, 170, 200, 240]
    neighbor_size = [3, 5, 10, 20, 30, 50, 70, 100, 150, 200]

    content_strategies = ['cb', 'cbt', 'cbd', 'cbh',
                          'cb_eset', 'cbt_eset', 'cbd_eset', 'cbh_eset']
    collaborative_strategies = ['knn_eset', 'knn', 'knn_plus']

    # Reduced setup for quick runs:
    #iterations = 1
    #profile_size = [10,20,30]
    #neighbor_size = [3,5,10,20,30,50]
    #content_strategies = ['cb']
    #collaborative_strategies = ['knn']

    # Content strategies are swept over profile sizes, collaborative ones
    # over neighborhood sizes.
    strategy_category = sys.argv[1]
    if strategy_category == "content":
        strategies = content_strategies
        sizes = profile_size
        option_str = "profile"
    elif strategy_category == "collaborative":
        strategies = collaborative_strategies
        sizes = neighbor_size
        option_str = "neighborhood"
    else:
        print("Usage: profile-suite strategy_category sample_file")
        sys.exit(1)

    cfg = Config()
    sample_file = sys.argv[2]
    sample_str = sample_file.split('/')[-1]

    # Each line of the sample file is a submission id; the submission lives
    # in a sub-directory named after the first two characters of the id.
    population_sample = []
    with open(sample_file, 'r') as f:
        for line in f:
            user_id = line.strip('\n')
            population_sample.append(
                os.path.join(cfg.popcon_dir, user_id[:2], user_id))

    sample_dir = ("results/%s/%s" %
                  (strategy_category, sample_str))
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)

    for strategy in strategies:
        cfg.strategy = strategy

        # Accumulators indexed by size.
        p_10_summary = {}       # per-user mean precision of the top 10
        f05_100_summary = {}    # per-user mean F(0.5) of the top 100
        c_10 = {}               # every item that appeared in some top 10
        c_100 = {}              # every item that appeared in some top 100

        log_file = os.path.join(sample_dir, sample_str + "-" + cfg.strategy)
        graph_10 = log_file + "-10.png"
        graph_100 = log_file + "-100.png"
        graph_10_jpg = png_to_jpg(graph_10)
        graph_100_jpg = png_to_jpg(graph_100)
        comment_10 = graph_10_jpg + ".comment"
        comment_100 = graph_100_jpg + ".comment"

        with open(comment_10, 'w') as f:
            f.write("# sample %s\n" % sample_str)
            f.write("# strategy %s\n# threshold 10\n# iterations %d\n\n" %
                    (cfg.strategy, iterations))
            f.write("# %s\tmean_p_10\tdev_p_10\tc_10\n\n" % option_str)
        with open(comment_100, 'w') as f:
            f.write("# sample %s\n" % sample_str)
            f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
                    (cfg.strategy, iterations))
            f.write("# %s\t\tmean_f05_100\t\tdev_f05_100\t\tc_100\n\n" % option_str)

        for size in sizes:
            c_10[size] = set()
            c_100[size] = set()
            p_10_summary[size] = []
            f05_100_summary[size] = []
            with open(log_file + "-%s%.3d" % (option_str, size), 'w') as f:
                f.write("# sample %s\n" % sample_str)
                f.write("# strategy %s-%s%.3d\n\n" % (cfg.strategy, option_str, size))
                f.write("# p_10\tf05_100\n\n")

        # main loop per user
        for submission_file in population_sample:
            user = PopconSystem(submission_file)
            user.filter_pkg_profile(cfg.pkgs_filter)
            user.maximal_pkg_profile()
            for size in sizes:
                # The same size is applied to both settings.
                cfg.profile_size = size
                cfg.k_neighbors = size
                rec = Recommender(cfg)
                repo_size = rec.items_repository.get_doccount()
                p_10 = []
                f05_100 = []
                for n in range(iterations):
                    # Hold out 90% of the profile as the expected answer and
                    # feed the remainder to the recommender.
                    profile_len = len(user.pkg_profile)
                    full_profile = {}
                    for pkg in user.pkg_profile:
                        full_profile[pkg] = user.item_score[pkg]
                    item_score, sample = holdout_split(
                        full_profile, int(profile_len * 0.9))
                    iteration_user = User(item_score)
                    recommendation = rec.get_recommendation(iteration_user,
                                                            repo_size)
                    if hasattr(recommendation, "ranking"):
                        ranking = recommendation.ranking
                        real = RecommendationResult(sample)
                        predicted_10 = RecommendationResult(
                            dict.fromkeys(ranking[:10], 1))
                        evaluation = Evaluation(predicted_10, real, repo_size)
                        p_10.append(evaluation.run(Precision()))
                        predicted_100 = RecommendationResult(
                            dict.fromkeys(ranking[:100], 1))
                        evaluation = Evaluation(predicted_100, real, repo_size)
                        f05_100.append(evaluation.run(F_score(0.5)))
                        c_10[size].update(ranking[:10])
                        c_100[size].update(ranking[:100])
                # save summary
                if p_10:
                    p_10_summary[size].append(numpy.mean(p_10))
                if f05_100:
                    f05_100_summary[size].append(numpy.mean(f05_100))

                # The per-user log line is written even when no iteration
                # produced a ranking (numpy.mean of an empty list is nan).
                with open(log_file + "-%s%.3d" % (option_str, size), 'a') as f:
                    f.write("%.4f \t%.4f\n" % (numpy.mean(p_10), numpy.mean(f05_100)))

        # back to main flow
        # repo_size below is the value left by the last Recommender built.
        coverage_10 = {}
        coverage_100 = {}
        with open(comment_10, 'a') as f:
            for size in sizes:
                coverage_10[size] = len(c_10[size]) / float(repo_size)
                f.write("%3d\t\t%.4f\t\t%.4f\t\t%.4f\n" %
                        (size, numpy.mean(p_10_summary[size]),
                         numpy.std(p_10_summary[size]), coverage_10[size]))
        with open(comment_100, 'a') as f:
            for size in sizes:
                coverage_100[size] = len(c_100[size]) / float(repo_size)
                f.write("%3d\t\t%.4f\t\t%.4f\t\t%.4f\n" %
                        (size, numpy.mean(f05_100_summary[size]),
                         numpy.std(f05_100_summary[size]), coverage_100[size]))

        # plot results summary
        x_label = '%s size' % option_str.capitalize()
        plot_summary("Setup: %s (threshold 10)" % cfg.strategy, x_label,
                     "Precision", p_10_summary, coverage_10, graph_10)
        plot_summary("Setup: %s (threshold 100)" % cfg.strategy, x_label,
                     "F05", f05_100_summary, coverage_100, graph_100)
... ...
src/experiments/roc-sample.py 0 → 100755
... ... @@ -0,0 +1,240 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + recommender suite - recommender experiments suite
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import sys
  23 +sys.path.insert(0,'../')
  24 +from config import Config
  25 +from data import PopconXapianIndex, PopconSubmission
  26 +from recommender import Recommender
  27 +from user import LocalSystem, User
  28 +from evaluation import *
  29 +import logging
  30 +import random
  31 +import Gnuplot
  32 +import numpy
  33 +import shutil
  34 +
def plot_roc(results,log_file,mean=0):
    """Plot the ROC curve stored in *results* as PNG and PostScript files.

    results  -- object providing coverage(), get_auc(), get_roc_points()
                and the per-threshold `precision` and `f05` mappings
    log_file -- path prefix of the generated files; its last path
                component is used in the plot title
    mean     -- when 1 the curve is drawn as "mean ROC" and saved as
                "<log_file>-roc-mean.*"; otherwise it is drawn with the
                xyerrorbars style and saved as "<log_file>-roc.*"
    """
    plot = Gnuplot.Gnuplot()
    plot('set style data lines')
    plot.xlabel('False Positive Rate')
    plot.ylabel('True Positive Rate')
    plot('set xrange [0:1.0]')
    plot('set yrange [0:1.0]')
    setup_name = log_file.split("/")[-1]
    plot.title("Setup: %s" % setup_name)

    # Summary figures printed in the lower right corner of the graph
    plot('set label "C %.4f" at 0.68,0.2' % results.coverage())
    plot('set label "AUC %.4f" at 0.68,0.15' % results.get_auc())
    p_10 = results.precision[10]
    plot('set label "P(10) %.2f +- %.2f" at 0.68,0.10' % (numpy.mean(p_10),numpy.std(p_10)))
    f05_100 = results.f05[100]
    plot('set label "F05(100) %.2f +- %.2f" at 0.68,0.05' % (numpy.mean(f05_100),numpy.std(f05_100)))

    if mean==1:
        curve = Gnuplot.Data(results.get_roc_points(),title="mean ROC")
        target = log_file+"-roc-mean"
    else:
        curve = Gnuplot.Data(results.get_roc_points(),title="ROC",with_="xyerrorbars")
        target = log_file+"-roc"
    # The diagonal from (0,0) to (1,1) is drawn as a reference line
    diagonal = Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7")
    plot.plot(curve,diagonal)
    plot.hardcopy(target+".png",terminal="png")
    plot.hardcopy(target+".ps",terminal="postscript",enhanced=1,color=1)
  57 +
def get_label(cfg):
    """Return the label dictionary describing the setup held in *cfg*.

    The result has a "description" (names of the varied parameters) and a
    "values" entry (the strategy plus the zero-padded parameter values).
    An empty dictionary is returned when cfg.strategy belongs to none of
    the content_based, collaborative or hybrid lists.
    """
    strategy = cfg.strategy
    if strategy in content_based:
        return {"description": "strategy-profile",
                "values": "%s-profile%.3d" % (strategy,cfg.profile_size)}
    if strategy in collaborative:
        return {"description": "strategy-knn",
                "values": "%s-k%.3d" % (strategy,cfg.k_neighbors)}
    if strategy in hybrid:
        return {"description": "strategy-knn-profile",
                "values": "%s-k%.3d-profile%.3d" %
                          (strategy,cfg.k_neighbors,cfg.profile_size)}
    return {}
  73 +
class ExperimentResults:
    """Evaluation metrics accumulated per threshold over experiment rounds.

    A threshold is a recommendation size: add_result() evaluates the first
    `size` entries of a ranking for every size listed in `thresholds`
    (1, 10, 20, ... below the repository size).
    """
    def __init__(self,repo_size):
        """Create empty result containers for a repository of *repo_size* items."""
        self.repository_size = repo_size
        self.precision = {}
        self.recall = {}
        self.fpr = {}
        self.f05 = {}
        self.recommended = {}
        # list() keeps the concatenation valid when range() is not a list
        self.thresholds = [1]+list(range(10,self.repository_size,10))
        for size in self.thresholds:
            self.precision[size] = []
            self.recall[size] = []
            self.fpr[size] = []
            self.f05[size] = []
            self.recommended[size] = set()

    def add_result(self,ranking,sample):
        """Evaluate *ranking* against *sample* at every threshold.

        ranking -- ordered list of recommended items
        sample  -- dictionary of the items held out from the user profile
        """
        for size in self.thresholds:
            recommendation = ranking[:size]
            self.recommended[size].update(recommendation)
            predicted = RecommendationResult(dict.fromkeys(recommendation,1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repository_size)
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            self.f05[size].append(evaluation.run(F_score(0.5)))
            self.fpr[size].append(evaluation.run(FPR()))

    def precision_summary(self):
        """Return [threshold, mean precision] pairs in threshold order."""
        return [[size,numpy.mean(self.precision[size])] for size in self.thresholds]

    def recall_summary(self):
        """Return [threshold, mean recall] pairs in threshold order."""
        return [[size,numpy.mean(self.recall[size])] for size in self.thresholds]

    def f05_summary(self):
        """Return [threshold, mean F0.5] pairs in threshold order."""
        return [[size,numpy.mean(self.f05[size])] for size in self.thresholds]

    def coverage_summary(self):
        """Return [threshold, coverage] pairs in threshold order."""
        return [[size,self.coverage(size)] for size in self.thresholds]

    def coverage(self,size=0):
        """Return the fraction of the repository recommended at *size*.

        When *size* is 0 (the default) the largest threshold is used.
        """
        if not size:
            size = self.thresholds[-1]
        return len(self.recommended[size])/float(self.repository_size)

    def mean_precision(self,size):
        """Return the mean precision recorded for threshold *size*."""
        return numpy.mean(self.precision[size])

    def precision(self,size):
        """Return the mean precision recorded for threshold *size*.

        NOTE: every instance shadows this method with the `precision`
        dictionary assigned in __init__, so it is reachable only as
        ExperimentResults.precision(obj, size); use mean_precision().
        """
        return self.mean_precision(size)

    def get_auc(self):
        """Return the area under the ROC curve, closed at (0,0) and (1,1)."""
        roc_points = self.get_roc_points()
        x_roc = [0]+[p[0] for p in roc_points]+[1]
        y_roc = [0]+[p[1] for p in roc_points]+[1]
        # numpy.trapz was renamed numpy.trapezoid in recent numpy releases
        integrate = getattr(numpy,"trapezoid",None) or numpy.trapz
        return integrate(y=y_roc, x=x_roc)

    # Average ROC by threshold (= size of recommendation)
    def get_roc_points(self):
        """Return sorted [mean fpr, mean tpr, std fpr, std tpr] rows, one per threshold."""
        points = []
        for size in self.recall.keys():
            tpr = self.recall[size]
            fpr = self.fpr[size]
            points.append([numpy.mean(fpr),numpy.mean(tpr),numpy.std(fpr),numpy.std(tpr)])
        return sorted(points)
  140 +
def run_strategy(cfg,sample_file):
    """Run the ROC experiment for every user listed in *sample_file*.

    cfg         -- configuration object; strategy, popcon_dir and
                   pkgs_filter are read here
    sample_file -- text file with one popcon submission id per line

    For each user, `iterations` rounds are executed: 90% of the profile is
    held out at random, a recommendation is computed from the remainder and
    evaluated against the held-out packages.  Plots and a summary table are
    written under "results/roc-sample/<sample file name>/".
    """
    rec = Recommender(cfg)
    repo_size = rec.items_repository.get_doccount()
    results = ExperimentResults(repo_size)
    label = get_label(cfg)
    population_sample = []
    sample_str = sample_file.split('/')[-1]
    with open(sample_file,'r') as f:
        for line in f:
            user_id = line.strip()
            # A blank line would otherwise yield a bogus submission path
            if not user_id:
                continue
            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
    sample_dir = ("results/roc-sample/%s" % sample_str)
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)
    log_file = os.path.join(sample_dir,label["values"])

    # n iterations per population user
    for submission_file in population_sample:
        user = PopconSystem(submission_file)
        user.filter_pkg_profile(cfg.pkgs_filter)
        user.maximal_pkg_profile()
        for n in range(iterations):
            # Fill sample profile
            profile_len = len(user.pkg_profile)
            item_score = {}
            for pkg in user.pkg_profile:
                item_score[pkg] = user.item_score[pkg]
            # Never ask for more packages than there are distinct ones
            sample_size = min(int(profile_len*0.9),len(item_score))
            sample = {}
            # One draw without replacement instead of rebuilding the key
            # list for every single random.choice()
            for key in random.sample(list(item_score),sample_size):
                sample[key] = item_score.pop(key)
            iteration_user = User(item_score)
            recommendation = rec.get_recommendation(iteration_user,repo_size)
            if hasattr(recommendation,"ranking"):
                results.add_result(recommendation.ranking,sample)

    plot_roc(results,log_file)
    plot_roc(results,log_file,1)
    with open(log_file+"-roc.jpg.comment",'w') as f:
        f.write("# %s\n# %s\n\n" %
                (label["description"],label["values"]))
        f.write("# roc AUC\n%.4f\n\n"%results.get_auc())
        f.write("# threshold\tmean_fpr\tdev_fpr\t\tmean_tpr\tdev_tpr\t\tcoverage\n")
        for size in results.thresholds:
            f.write("%4d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\n" %
                    (size,numpy.mean(results.fpr[size]),
                     numpy.std(results.fpr[size]),
                     numpy.mean(results.recall[size]),
                     numpy.std(results.recall[size]),
                     results.coverage(size)))
  192 +
def run_content(cfg,sample_file):
    """Run the experiment once for each value of the profile_size list."""
    for candidate_size in profile_size:
        cfg.profile_size = candidate_size
        run_strategy(cfg,sample_file)
  197 +
def run_collaborative(cfg,sample_file):
    """Run the experiment once for each value of the neighbors list."""
    for k_value in neighbors:
        cfg.k_neighbors = k_value
        run_strategy(cfg,sample_file)
  202 +
def run_hybrid(cfg,sample_file):
    """Run the experiment for every (neighbors, profile_size) combination."""
    for k_value in neighbors:
        cfg.k_neighbors = k_value
        for candidate_size in profile_size:
            cfg.profile_size = candidate_size
            run_strategy(cfg,sample_file)
  209 +
if __name__ == '__main__':
    # Both the strategy (argv[1]) and the sample file (argv[2]) are read
    # below, so both arguments are mandatory.
    if len(sys.argv)<3:
        print("Usage: roc-sample strategy_str popcon_sample_path")
        sys.exit(1)

    # Module-level experiment parameters, read by the functions above.
    #iterations = 3
    #content_based = ['cb']
    #collaborative = ['knn_eset']
    #hybrid = ['knnco']
    #profile_size = [50,100]
    #neighbors = [50]
    iterations = 20
    content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
    collaborative = ['knn_eset','knn','knn_plus']
    hybrid = ['knnco','knnco_eset']
    profile_size = [10,20,50,100,200]
    neighbors = [200]
    #neighbors = [3,10,50,100,200]
    #profile_size = [10,20,40,60,80,100,140,170,200,240]
    #neighbors = [3,5,10,20,30,50,70,100,150,200]

    cfg = Config()
    cfg.strategy = sys.argv[1]
    sample_file = sys.argv[2]

    # The three strategy lists are disjoint, so at most one branch applies
    if cfg.strategy in content_based:
        run_content(cfg,sample_file)
    elif cfg.strategy in collaborative:
        run_collaborative(cfg,sample_file)
    elif cfg.strategy in hybrid:
        run_hybrid(cfg,sample_file)
    else:
        # Previously an unknown strategy ended silently without running anything
        print("Unknown strategy: %s" % cfg.strategy)
        sys.exit(1)
... ...
src/experiments/roc-single.py 0 → 100755
... ... @@ -0,0 +1,269 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + recommender suite - recommender experiments suite
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import sys
  23 +sys.path.insert(0,'../')
  24 +from config import Config
  25 +from data import PopconXapianIndex, PopconSubmission
  26 +from recommender import Recommender
  27 +from user import LocalSystem, User
  28 +from evaluation import *
  29 +import logging
  30 +import random
  31 +import Gnuplot
  32 +import numpy
  33 +import shutil
  34 +
def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
    """Write the recall log of iteration *n* to the file "<log_file>-<n>".

    label          -- dictionary with "description" and "values" entries
    n              -- iteration number, written zero-padded to two digits
    sample         -- dictionary whose keys are the held-out packages
    recommendation -- result object; its `ranking` list is used if present
    profile_size   -- size of the full user profile
    repo_size      -- number of items in the repository

    After the header, the ranking positions of the sampled packages are
    listed in ascending order, followed by the packages that do not appear
    in the ranking at all.
    """
    # Write recall log
    with open(("%s-%.2d" % (log_file,n)),'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%.2d\n" % (label["values"],n))
        output.write("\n# repository profile sample\n%d %d %d\n" % \
                     (repo_size,profile_size,len(sample)))
        if hasattr(recommendation,"ranking"):
            # Index the ranking once (first occurrence wins, as list.index
            # does) instead of scanning it twice for every sampled package
            position = {}
            for rank,pkg in enumerate(recommendation.ranking):
                position.setdefault(pkg,rank)
            notfound = []
            ranks = []
            for pkg in sample.keys():
                if pkg in position:
                    ranks.append(position[pkg])
                else:
                    notfound.append(pkg)
            for r in sorted(ranks):
                output.write(str(r)+"\n")
            if notfound:
                output.write("# out of recommendation:\n")
                for pkg in notfound:
                    output.write(pkg+"\n")
  57 +
def plot_summary(results,log_file):
    """Plot precision, recall, F05 and coverage against the threshold.

    The graph is saved as "<log_file>.png" and "<log_file>.ps", then
    replotted with a logarithmic x axis and saved again as
    "<log_file>-logscale.png" and "<log_file>-logscale.ps".
    """
    # Plot metrics summary
    plot = Gnuplot.Gnuplot()
    plot('set style data lines')
    plot('set yrange [0:1.0]')
    plot.xlabel('Threshold (recommendation size)')
    setup_name = log_file.split("/")[-1]
    plot.title("Setup: %s" % setup_name)
    curves = [Gnuplot.Data(results.precision_summary(),title="Precision"),
              Gnuplot.Data(results.recall_summary(),title="Recall"),
              Gnuplot.Data(results.f05_summary(),title="F05"),
              Gnuplot.Data(results.coverage_summary(),title="Coverage")]
    plot.plot(*curves)
    plot.hardcopy(log_file+".png",terminal="png")
    plot.hardcopy(log_file+".ps",terminal="postscript",enhanced=1,color=1)
    # Same graph again, with the x axis in logarithmic scale
    plot('set logscale x')
    plot('replot')
    logscale_file = log_file+"-logscale"
    plot.hardcopy(logscale_file+".png",terminal="png")
    plot.hardcopy(logscale_file+".ps",terminal="postscript",enhanced=1,color=1)
  75 +
def plot_roc(results,log_file):
    """Plot the ROC curve stored in *results*.

    The graph is annotated with coverage, AUC, P(10), P(20) and F05(100)
    and saved as "<log_file>-roc.png" and "<log_file>-roc.ps".
    """
    plot = Gnuplot.Gnuplot()
    plot('set style data lines')
    plot.xlabel('False Positive Rate')
    plot.ylabel('True Positive Rate')
    plot('set xrange [0:1.0]')
    plot('set yrange [0:1.0]')
    setup_name = log_file.split("/")[-1]
    plot.title("Setup: %s" % setup_name)
    # Summary figures printed in the lower right corner of the graph
    plot('set label "C %.2f" at 0.8,0.25' % results.coverage())
    plot('set label "AUC %.2f" at 0.8,0.2' % results.get_auc())
    plot('set label "P(10) %.2f" at 0.8,0.15' % numpy.mean(results.precision[10]))
    plot('set label "P(20) %.2f" at 0.8,0.10' % numpy.mean(results.precision[20]))
    plot('set label "F05(100) %.2f" at 0.8,0.05' % numpy.mean(results.f05[100]))
    roc_curve = Gnuplot.Data(results.get_roc_points(),title="ROC")
    # The diagonal from (0,0) to (1,1) is drawn as a reference line
    diagonal = Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7")
    plot.plot(roc_curve,diagonal)
    target = log_file+"-roc"
    plot.hardcopy(target+".png",terminal="png")
    plot.hardcopy(target+".ps",terminal="postscript",enhanced=1,color=1)
  94 +
def get_label(cfg):
    """Return the label dictionary describing the setup held in *cfg*.

    The result has a "description" (names of the varied parameters) and a
    "values" entry (the strategy plus the zero-padded parameter values).
    An empty dictionary is returned when cfg.strategy belongs to none of
    the content_based, collaborative or hybrid lists.
    """
    strategy = cfg.strategy
    label = {}
    if strategy in content_based:
        description = "strategy-profile"
        values = "%s-profile%.3d" % (strategy,cfg.profile_size)
    elif strategy in collaborative:
        description = "strategy-knn"
        values = "%s-k%.3d" % (strategy,cfg.k_neighbors)
    elif strategy in hybrid:
        description = "strategy-knn-profile"
        values = "%s-k%.3d-profile%.3d" % (strategy,cfg.k_neighbors,cfg.profile_size)
    else:
        return label
    label["description"] = description
    label["values"] = values
    return label
  110 +
class ExperimentResults:
    """Evaluation metrics accumulated per threshold over experiment rounds.

    A threshold is a recommendation size: add_result() evaluates the first
    `size` entries of a ranking for every size listed in `thresholds`
    (1, 10, 20, ... below the repository size).
    """
    def __init__(self,repo_size):
        """Create empty result containers for a repository of *repo_size* items."""
        self.repository_size = repo_size
        self.precision = {}
        self.recall = {}
        self.fpr = {}
        self.f05 = {}
        self.recommended = {}
        # list() keeps the concatenation valid when range() is not a list
        self.thresholds = [1]+list(range(10,self.repository_size,10))
        for size in self.thresholds:
            self.precision[size] = []
            self.recall[size] = []
            self.fpr[size] = []
            self.f05[size] = []
            self.recommended[size] = set()

    def add_result(self,ranking,sample):
        """Evaluate *ranking* against *sample* at every threshold.

        ranking -- ordered list of recommended items
        sample  -- dictionary of the items held out from the user profile
        """
        for size in self.thresholds:
            recommendation = ranking[:size]
            self.recommended[size].update(recommendation)
            predicted = RecommendationResult(dict.fromkeys(recommendation,1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repository_size)
            # Precision is evaluated once; the leftover debug print that
            # evaluated and echoed it for every threshold was dropped.
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            self.f05[size].append(evaluation.run(F_score(0.5)))
            self.fpr[size].append(evaluation.run(FPR()))

    def precision_summary(self):
        """Return [threshold, mean precision] pairs in threshold order."""
        return [[size,numpy.mean(self.precision[size])] for size in self.thresholds]

    def recall_summary(self):
        """Return [threshold, mean recall] pairs in threshold order."""
        return [[size,numpy.mean(self.recall[size])] for size in self.thresholds]

    def f05_summary(self):
        """Return [threshold, mean F0.5] pairs in threshold order."""
        return [[size,numpy.mean(self.f05[size])] for size in self.thresholds]

    def coverage_summary(self):
        """Return [threshold, coverage] pairs in threshold order."""
        return [[size,self.coverage(size)] for size in self.thresholds]

    def coverage(self,size=0):
        """Return the fraction of the repository recommended at *size*.

        When *size* is 0 (the default) the largest threshold is used.
        """
        if not size:
            size = self.thresholds[-1]
        return len(self.recommended[size])/float(self.repository_size)

    def mean_precision(self,size):
        """Return the mean precision recorded for threshold *size*."""
        return numpy.mean(self.precision[size])

    def precision(self,size):
        """Return the mean precision recorded for threshold *size*.

        NOTE: every instance shadows this method with the `precision`
        dictionary assigned in __init__, so it is reachable only as
        ExperimentResults.precision(obj, size); use mean_precision().
        """
        return self.mean_precision(size)

    def get_auc(self):
        """Return the area under the ROC curve, closed at (0,0) and (1,1).

        NaN is returned when no result has been recorded, since the bare
        diagonal would otherwise be reported as an AUC of 0.5.
        """
        roc_points = self.get_roc_points()
        if not roc_points:
            return float('nan')
        x_roc = [0]+[p[0] for p in roc_points]+[1]
        y_roc = [0]+[p[1] for p in roc_points]+[1]
        # numpy.trapz was renamed numpy.trapezoid in recent numpy releases
        integrate = getattr(numpy,"trapezoid",None) or numpy.trapz
        return integrate(y=y_roc, x=x_roc)

    # Average ROC by threshold (= size of recommendation)
    def get_roc_points(self):
        """Return sorted [mean fpr, mean tpr] points, one per threshold.

        Thresholds without any recorded result are skipped; averaging
        their empty lists used to raise ZeroDivisionError.
        """
        points = []
        for size in self.recall.keys():
            tpr = self.recall[size]
            fpr = self.fpr[size]
            if not tpr or not fpr:
                continue
            points.append([sum(fpr)/len(fpr),sum(tpr)/len(tpr)])
        return sorted(points)
  178 +
def run_strategy(cfg,user):
    """Run `iterations` rounds of the ROC experiment for a single user.

    cfg  -- configuration object; its strategy selects the output directory
    user -- user object providing user_id, pkg_profile and item_score

    In each round 90% of the profile is held out at random, a
    recommendation is computed from the remainder, a recall log is written
    and the ranking is evaluated against the held-out packages.  A summary
    table and the plots are written under
    "results/roc-suite/<first 8 chars of user id>/<strategy>/".
    """
    rec = Recommender(cfg)
    repo_size = rec.items_repository.get_doccount()
    results = ExperimentResults(repo_size)
    label = get_label(cfg)
    user_dir = ("results/roc-suite/%s/%s" % (user.user_id[:8],cfg.strategy))
    if not os.path.exists(user_dir):
        os.makedirs(user_dir)
    log_file = os.path.join(user_dir,label["values"])
    for n in range(iterations):
        # Fill sample profile
        profile_len = len(user.pkg_profile)
        item_score = {}
        for pkg in user.pkg_profile:
            item_score[pkg] = user.item_score[pkg]
        # Never ask for more packages than there are distinct ones
        sample_size = min(int(profile_len*0.9),len(item_score))
        sample = {}
        # One draw without replacement instead of rebuilding the key list
        # for every single random.choice()
        for key in random.sample(list(item_score),sample_size):
            sample[key] = item_score.pop(key)
        iteration_user = User(item_score)
        recommendation = rec.get_recommendation(iteration_user,repo_size)
        write_recall_log(label,n,sample,recommendation,profile_len,repo_size,log_file)
        if hasattr(recommendation,"ranking"):
            results.add_result(recommendation.ranking,sample)
    with open(log_file+"-roc.jpg.comment",'w') as f:
        f.write("# %s\n# %s\n\n" %
                (label["description"],label["values"]))
        f.write("# roc AUC\n%.4f\n\n"%results.get_auc())
        f.write("# threshold\tprecision\trecall\t\tf05\t\tcoverage\n")
        for size in results.thresholds:
            f.write("%4d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\n" %
                    (size,numpy.mean(results.precision[size]),
                     numpy.mean(results.recall[size]),
                     numpy.mean(results.f05[size]),
                     results.coverage(size)))
    # The same comment file accompanies the summary and logscale graphs
    shutil.copy(log_file+"-roc.jpg.comment",log_file+".jpg.comment")
    shutil.copy(log_file+"-roc.jpg.comment",log_file+"-logscale.jpg.comment")
    plot_roc(results,log_file)
    plot_summary(results,log_file)
  219 +
def run_content(user,cfg):
    """Run the experiment once for each value of the profile_size list."""
    for candidate_size in profile_size:
        cfg.profile_size = candidate_size
        run_strategy(cfg,user)
  224 +
def run_collaborative(user,cfg):
    """Run the experiment once for each value of the neighbors list."""
    for k_value in neighbors:
        cfg.k_neighbors = k_value
        run_strategy(cfg,user)
  229 +
def run_hybrid(user,cfg):
    """Run the experiment for every (neighbors, profile_size) combination."""
    for k_value in neighbors:
        cfg.k_neighbors = k_value
        for candidate_size in profile_size:
            cfg.profile_size = candidate_size
            run_strategy(cfg,user)
  236 +
if __name__ == '__main__':
    if len(sys.argv)<2:
        print("Usage: roc-suite strategy_str [popcon_submission_path]")
        sys.exit(1)

    # Module-level experiment parameters, read by the functions above.
    #iterations = 3
    #content_based = ['cb']
    #collaborative = ['knn_eset']
    #hybrid = ['knnco']
    #profile_size = [50,100]
    #neighbors = [50]
    iterations = 20
    content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
    collaborative = ['knn_eset','knn','knn_plus']
    hybrid = ['knnco','knnco_eset']
    profile_size = [10,20,40,60,80,100,140,170,200,240]
    neighbors = [3,5,10,20,30,50,70,100,150,200]

    cfg = Config()
    cfg.strategy = sys.argv[1]

    # The usage line advertises an optional submission path: honour it when
    # given and fall back to the submission used so far otherwise.
    #submission_path = "/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623"
    submission_path = "/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7"
    if len(sys.argv)>2:
        submission_path = sys.argv[2]
    user = PopconSystem(submission_path)
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    if cfg.strategy in content_based:
        run_content(user,cfg)
    if cfg.strategy in collaborative:
        run_collaborative(user,cfg)
    if cfg.strategy in hybrid:
        run_hybrid(user,cfg)
... ...
src/experiments/roc-suite.py
... ... @@ -1,328 +0,0 @@
1   -#!/usr/bin/env python
2   -"""
3   - recommender suite - recommender experiments suite
4   -"""
5   -__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
6   -__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
7   -__license__ = """
8   - This program is free software: you can redistribute it and/or modify
9   - it under the terms of the GNU General Public License as published by
10   - the Free Software Foundation, either version 3 of the License, or
11   - (at your option) any later version.
12   -
13   - This program is distributed in the hope that it will be useful,
14   - but WITHOUT ANY WARRANTY; without even the implied warranty of
15   - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16   - GNU General Public License for more details.
17   -
18   - You should have received a copy of the GNU General Public License
19   - along with this program. If not, see <http://www.gnu.org/licenses/>.
20   -"""
21   -
22   -import sys
23   -sys.path.insert(0,'../')
24   -from config import Config
25   -from data import PopconXapianIndex, PopconSubmission
26   -from recommender import Recommender
27   -from user import LocalSystem, User
28   -from evaluation import *
29   -import logging
30   -import random
31   -import Gnuplot
32   -import numpy
33   -
# Reduced configuration, handy for a quick trial run:
#iterations = 3
#sample_proportions = [0.9]
#weighting = [('bm25',1.2)]
#collaborative = ['knn_eset']
#content_based = ['cb']
#hybrid = ['knnco']
#profile_size = [50,100]
#popcon_size = ["1000"]
#neighbors = [50]

# Full experiment configuration
iterations = 30
sample_proportions = [0.9]
# (weighting scheme, bm25 k1 value) pairs
weighting = [
    ('bm25',1.0),
    ('bm25',1.2),
    ('bm25',2.0),
    ('trad',0),
]
# Strategy names grouped by recommender family
content_based = [
    'cb','cbt','cbd','cbh',
    'cb_eset','cbt_eset','cbd_eset','cbh_eset',
]
collaborative = ['knn_eset','knn','knn_plus']
hybrid = ['knnco','knnco_eset']
# Parameter values swept by the run_* functions
profile_size = range(20,200,20)
neighbors = range(10,510,50)
52   -
def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
    """Write the recall log of iteration *n* to the file "<log_file>-<n>".

    label          -- dictionary with "description" and "values" entries
    n              -- iteration number, written zero-padded to two digits
    sample         -- dictionary whose keys are the held-out packages
    recommendation -- result object; its `ranking` list is used if present
    profile_size   -- size of the full user profile
    repo_size      -- number of items in the repository

    After the header, the ranking positions of the sampled packages are
    listed in ascending order, followed by the packages that do not appear
    in the ranking at all.
    """
    # Write recall log
    with open(("%s-%.2d" % (log_file,n)),'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%.2d\n" % (label["values"],n))
        output.write("\n# repository profile sample\n%d %d %d\n" % \
                     (repo_size,profile_size,len(sample)))
        if hasattr(recommendation,"ranking"):
            # Index the ranking once (first occurrence wins, as list.index
            # does) instead of scanning it twice for every sampled package
            position = {}
            for rank,pkg in enumerate(recommendation.ranking):
                position.setdefault(pkg,rank)
            notfound = []
            ranks = []
            for pkg in sample.keys():
                if pkg in position:
                    ranks.append(position[pkg])
                else:
                    notfound.append(pkg)
            for r in sorted(ranks):
                output.write(str(r)+"\n")
            if notfound:
                output.write("# out of recommendation:\n")
                for pkg in notfound:
                    output.write(pkg+"\n")
75   -
def plot_roc(roc_points,auc,eauc,c,p,log_file):
    """Plot *roc_points* and save "<log_file>-roc.png" and "<log_file>-roc.ps".

    roc_points -- sequence of curve points
    auc, eauc  -- values printed as the "AUC" and "EAUC" labels
    c, p       -- values printed as the "C" and "P(20)" labels
    log_file   -- path prefix of the output files; its last path component
                  is used in the plot title
    """
    plot = Gnuplot.Gnuplot()
    plot('set style data lines')
    plot.xlabel('False Positive Rate')
    plot.ylabel('True Positive Rate')
    plot('set xrange [0:1.0]')
    plot('set yrange [0:1.0]')
    setup_name = log_file.split("/")[-1]
    plot.title("Setup: %s" % setup_name)
    plot('set label "C %.2f" at 0.8,0.25' % c)
    plot('set label "P(20) %.2f" at 0.8,0.2' % p)
    plot('set label "AUC %.4f" at 0.8,0.15' % auc)
    plot('set label "EAUC %.4f" at 0.8,0.1' % eauc)
    roc_curve = Gnuplot.Data(roc_points,title="ROC")
    # Reference diagonal, plus a segment from the last point to (1,1)
    diagonal = Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7")
    closing = Gnuplot.Data([roc_points[-1],[1,1]],with_="lines lt 6")
    plot.plot(roc_curve,diagonal,closing)
    target = log_file+"-roc"
    plot.hardcopy(target+".png",terminal="png")
    plot.hardcopy(target+".ps",terminal="postscript",enhanced=1,color=1)
93   -
def plot_summary(precision,recall,f1,f05,accuracy,log_file):
    """Plot the five metric series against the recommendation size.

    The graph is saved as "<log_file>.png" and "<log_file>.ps", then
    replotted with a logarithmic x axis and saved again as
    "<log_file>-logscale.png" and "<log_file>-logscale.ps".
    """
    # Plot metrics summary
    plot = Gnuplot.Gnuplot()
    plot('set style data lines')
    plot.xlabel('Recommendation size')
    setup_name = log_file.split("/")[-1]
    plot.title("Setup: %s" % setup_name)
    curves = [Gnuplot.Data(accuracy,title="Accuracy"),
              Gnuplot.Data(precision,title="Precision"),
              Gnuplot.Data(recall,title="Recall"),
              Gnuplot.Data(f1,title="F_1"),
              Gnuplot.Data(f05,title="F_0.5")]
    plot.plot(*curves)
    plot.hardcopy(log_file+".png",terminal="png")
    plot.hardcopy(log_file+".ps",terminal="postscript",enhanced=1,color=1)
    # Same graph again, with the x axis in logarithmic scale
    plot('set logscale x')
    plot('replot')
    logscale_file = log_file+"-logscale"
    plot.hardcopy(logscale_file+".png",terminal="png")
    plot.hardcopy(logscale_file+".ps",terminal="postscript",enhanced=1,color=1)
111   -
def get_label(cfg,sample_proportion):
    """Return the label dictionary describing the setup held in *cfg*.

    The "values" entry combines the strategy, the varied parameters, the
    last path component of cfg.pkgs_filter and cfg.bm25_k1.  For a strategy
    found in none of the known lists "Unknown strategy" is printed and an
    empty dictionary is returned.  *sample_proportion* is not used.
    """
    strategy = cfg.strategy
    if strategy in content_based:
        filter_name = cfg.pkgs_filter.split("/")[-1]
        return {"description": "strategy-filter-profile-k1_bm25",
                "values": "%s-profile%.3d-%s-kbm%.1f" %
                          (strategy,cfg.profile_size,filter_name,cfg.bm25_k1)}
    if strategy in collaborative:
        filter_name = cfg.pkgs_filter.split("/")[-1]
        return {"description": "strategy-knn-filter-k1_bm25",
                "values": "%s-k%.3d-%s-kbm%.1f" %
                          (strategy,cfg.k_neighbors,filter_name,cfg.bm25_k1)}
    if strategy in hybrid:
        filter_name = cfg.pkgs_filter.split("/")[-1]
        return {"description": "strategy-knn-filter-profile-k1_bm25",
                "values": "%s-k%.3d-profile%.3d-%s-kbm%.1f" %
                          (strategy,cfg.k_neighbors,cfg.profile_size,
                           filter_name,cfg.bm25_k1)}
    print("Unknown strategy")
    return {}
135   -
class ExperimentResults:
    """Evaluation metrics accumulated per recommendation size.

    Only precision, recall and fpr are recorded by add_result(); the
    accuracy, f1 and f05 containers exist but stay empty, so their
    summaries are empty and best_f1()/best_f05() raise ValueError.
    """
    def __init__(self,repo_size):
        """Create empty result containers for a repository of *repo_size* items."""
        self.repository_size = repo_size
        self.accuracy = {}
        self.precision = {}
        self.recall = {}
        self.f1 = {}
        self.f05 = {}
        self.fpr = {}
        #points = [1]+range(10,200,10)+range(200,self.repository_size,100)
        # list() keeps the concatenation valid when range() is not a list
        points = [1]+list(range(10,self.repository_size,10))
        self.recommended = set()
        for size in points:
            self.accuracy[size] = []
            self.precision[size] = []
            self.recall[size] = []
            self.f1[size] = []
            self.f05[size] = []
            self.fpr[size] = []

    def add_result(self,ranking,sample):
        """Evaluate the top-`size` cuts of *ranking* against *sample*."""
        print("len_recommended %d" % len(self.recommended))
        print("len_rank %d" % len(ranking))
        self.recommended = self.recommended.union(ranking)
        print("len_recommended %d" % len(self.recommended))
        # get data only for point
        for size in self.accuracy.keys():
            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repository_size)
            #self.accuracy[size].append(evaluation.run(Accuracy()))
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            #self.f1[size].append(evaluation.run(F_score(1)))
            #self.f05[size].append(evaluation.run(F_score(0.5)))
            self.fpr[size].append(evaluation.run(FPR()))

    def _summary(self,metric):
        # Sorted [size, mean] pairs; sizes without values are skipped because
        # averaging an empty list raised ZeroDivisionError.
        return sorted([size,sum(values)/len(values)]
                      for size,values in metric.items() if values)

    def _best(self,metric,min_size=0):
        # (size, value) of the highest single value among sizes > min_size.
        # Candidates are sorted so ties resolve to the smallest size.
        candidates = sorted(size for size in metric
                            if metric[size] and size>min_size)
        if not candidates:
            raise ValueError("no results recorded for this metric")
        size = max(candidates,key=lambda x: max(metric[x]))
        return (size,max(metric[size]))

    # Average ROC by threshold (which is the size)
    def get_roc_points(self):
        """Return sorted [mean fpr, mean tpr] points for sizes with results."""
        points = []
        for size in self.recall.keys():
            tpr = self.recall[size]
            fpr = self.fpr[size]
            if not tpr or not fpr:
                continue
            points.append([sum(fpr)/len(fpr),sum(tpr)/len(tpr)])
        return sorted(points)

    def get_precision_summary(self):
        """Return sorted [size, mean precision] pairs."""
        return self._summary(self.precision)

    def get_recall_summary(self):
        """Return sorted [size, mean recall] pairs."""
        return self._summary(self.recall)

    def get_f1_summary(self):
        """Return sorted [size, mean F1] pairs."""
        return self._summary(self.f1)

    def get_f05_summary(self):
        """Return sorted [size, mean F0.5] pairs."""
        return self._summary(self.f05)

    def get_accuracy_summary(self):
        """Return sorted [size, mean accuracy] pairs."""
        return self._summary(self.accuracy)

    def best_precision(self):
        """Return (size, precision) of the best precision among sizes above 10.

        The former key `max(...) and x>10` collapsed to a boolean, so the
        precision values were never actually compared.
        """
        return self._best(self.precision,10)

    def best_f1(self):
        """Return (size, F1) of the best recorded F1 value."""
        return self._best(self.f1)

    def best_f05(self):
        """Return (size, F0.5) of the best recorded F0.5 value."""
        return self._best(self.f05)
213   -
def run_strategy(cfg,user):
    """Run the ROC experiment for *user* with every weighting scheme.

    For each (weight, bm25 k1) pair in `weighting` and each proportion in
    `sample_proportions`, `iterations` rounds are run: the given proportion
    of the profile is held out at random, a recommendation is computed from
    the remainder and evaluated against the held-out packages.  A summary
    is written to "results/<label values>" and the ROC curve is plotted.
    """
    # numpy.trapz was renamed numpy.trapezoid in recent numpy releases
    integrate = getattr(numpy,"trapezoid",None) or numpy.trapz
    for weight in weighting:
        cfg.weight = weight[0]
        cfg.bm25_k1 = weight[1]
        rec = Recommender(cfg)
        repo_size = rec.items_repository.get_doccount()
        for proportion in sample_proportions:
            results = ExperimentResults(repo_size)
            label = get_label(cfg,proportion)
            #log_file = "results/20110906/4a67a295/"+label["values"]
            log_file = "results/"+label["values"]
            for n in range(iterations):
                # Fill sample profile
                profile_size = len(user.pkg_profile)
                item_score = {}
                for pkg in user.pkg_profile:
                    item_score[pkg] = user.item_score[pkg]
                # Never ask for more packages than there are distinct ones
                sample_size = min(int(profile_size*proportion),len(item_score))
                sample = {}
                # One draw without replacement instead of rebuilding the
                # key list for every single random.choice()
                for key in random.sample(list(item_score),sample_size):
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,repo_size)
                #write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
            with open(log_file,'w') as f:
                roc_points = results.get_roc_points()
                x_coord = [p[0] for p in roc_points]
                y_coord = [p[1] for p in roc_points]
                auc = integrate(y=y_coord, x=x_coord)
                # Extended AUC: add the areas from (0,0) to the first point
                # and from the last point to (1,1)
                eauc = (auc+
                        integrate(y=[0,roc_points[0][1]],x=[0,roc_points[0][0]])+
                        integrate(y=[roc_points[-1][1],1],x=[roc_points[-1][0],1]))
                # Reported as p(20), so it is read from threshold 20
                # (it used to be read from threshold 10)
                precision_20 = sum(results.precision[20])/len(results.precision[20])
                coverage = len(results.recommended)/float(repo_size)
                # The debugging prints and the exit(1) that made the rest of
                # this function unreachable were removed.
                f.write("# %s\n# %s\n\n" %
                        (label["description"],label["values"]))
                f.write("# coverage \tp(20) \tauc \teauc\n\t%.2f \t%.2f \t%.4f \t%.4f\n\n" %
                        (coverage,precision_20,auc,eauc))
            plot_roc(roc_points,auc,eauc,coverage,precision_20,log_file)
276   -
def run_content(user,cfg):
    """Run every content_based strategy with every profile_size value."""
    for strategy_name in content_based:
        cfg.strategy = strategy_name
        for candidate_size in profile_size:
            cfg.profile_size = candidate_size
            run_strategy(cfg,user)
283   -
284   -def run_collaborative(user,cfg):
285   - popcon_desktopapps = cfg.popcon_desktopapps
286   - popcon_programs = cfg.popcon_programs
287   - for strategy in collaborative:
288   - cfg.strategy = strategy
289   - for k in neighbors:
290   - cfg.k_neighbors = k
291   - #for size in popcon_size:
292   - # if size:
293   - # cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
294   - # cfg.popcon_programs = popcon_programs+"_"+size
295   - run_strategy(cfg,user)
296   -
297   -def run_hybrid(user,cfg):
298   - popcon_desktopapps = cfg.popcon_desktopapps
299   - popcon_programs = cfg.popcon_programs
300   - for strategy in hybrid:
301   - cfg.strategy = strategy
302   - for k in neighbors:
303   - cfg.k_neighbors = k
304   - #for size in popcon_size:
305   - # if size:
306   - # cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
307   - # cfg.popcon_programs = popcon_programs+"_"+size
308   - for size in profile_size:
309   - cfg.profile_size = size
310   - run_strategy(cfg,user)
311   -
312   -if __name__ == '__main__':
313   - #user = LocalSystem()
314   - #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
315   -
316   - cfg = Config()
317   - #user = PopconSystem("/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7")
318   - user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
319   - #user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a5834eb2aba6b6f17312239e0761c70")
320   - user.filter_pkg_profile(cfg.pkgs_filter)
321   - user.maximal_pkg_profile()
322   -
323   - if "content" in sys.argv or len(sys.argv)<2:
324   - run_content(user,cfg)
325   - if "collaborative" in sys.argv or len(sys.argv)<2:
326   - run_collaborative(user,cfg)
327   - if "hybrid" in sys.argv or len(sys.argv)<2:
328   - run_hybrid(user,cfg)
src/experiments/runner.py
... ... @@ -1,171 +0,0 @@
1   -#!/usr/bin/env python
2   -"""
3   - recommender suite - recommender experiments suite
4   -"""
5   -__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
6   -__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
7   -__license__ = """
8   - This program is free software: you can redistribute it and/or modify
9   - it under the terms of the GNU General Public License as published by
10   - the Free Software Foundation, either version 3 of the License, or
11   - (at your option) any later version.
12   -
13   - This program is distributed in the hope that it will be useful,
14   - but WITHOUT ANY WARRANTY; without even the implied warranty of
15   - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16   - GNU General Public License for more details.
17   -
18   - You should have received a copy of the GNU General Public License
19   - along with this program. If not, see <http://www.gnu.org/licenses/>.
20   -"""
21   -
22   -import expsuite
23   -import sys
24   -sys.path.insert(0,'../')
25   -from config import Config
26   -from data import PopconXapianIndex, PopconSubmission
27   -from recommender import Recommender
28   -from user import LocalSystem, User
29   -from evaluation import *
30   -import logging
31   -import random
32   -import Gnuplot
33   -
34   -class ClusteringSuite(expsuite.PyExperimentSuite):
35   - def reset(self, params, rep):
36   - self.cfg = Config()
37   - self.cfg.popcon_index = "../tests/test_data/.sample_pxi"
38   - self.cfg.popcon_dir = "../tests/test_data/popcon_dir"
39   - self.cfg.clusters_dir = "../tests/test_data/clusters_dir"
40   -
41   - if params['name'] == "clustering":
42   - logging.info("Starting 'clustering' experiments suite...")
43   - self.cfg.index_mode = "recluster"
44   -
45   - def iterate(self, params, rep, n):
46   - if params['name'] == "clustering":
47   - logging.info("Running iteration %d" % params['medoids'][n])
48   - self.cfg.k_medoids = params['medoids'][n]
49   - pxi = PopconXapianIndex(self.cfg)
50   - result = {'k_medoids': params['medoids'][n],
51   - 'dispersion': pxi.cluster_dispersion}
52   - else:
53   - result = {}
54   - return result
55   -
56   -class ContentBasedSuite(expsuite.PyExperimentSuite):
57   - def reset(self, params, rep):
58   - if params['name'].startswith("content"):
59   - cfg = Config()
60   - #if the index was not built yet
61   - #app_axi = AppAptXapianIndex(cfg.axi,"results/arnaldo/AppAxi")
62   - cfg.axi = "data/AppAxi"
63   - cfg.index_mode = "old"
64   - cfg.weight = params['weight']
65   - self.rec = Recommender(cfg)
66   - self.rec.set_strategy(params['strategy'])
67   - self.repo_size = self.rec.items_repository.get_doccount()
68   - self.user = LocalSystem()
69   - self.user.app_pkg_profile(self.rec.items_repository)
70   - self.user.no_auto_pkg_profile()
71   - self.sample_size = int(len(self.user.pkg_profile)*params['sample'])
72   - # iteration should be set to 10 in config file
73   - #self.profile_size = range(10,101,10)
74   -
75   - def iterate(self, params, rep, n):
76   - if params['name'].startswith("content"):
77   - item_score = dict.fromkeys(self.user.pkg_profile,1)
78   - # Prepare partition
79   - sample = {}
80   - for i in range(self.sample_size):
81   - key = random.choice(item_score.keys())
82   - sample[key] = item_score.pop(key)
83   - # Get full recommendation
84   - user = User(item_score)
85   - recommendation = self.rec.get_recommendation(user,self.repo_size)
86   - # Write recall log
87   - recall_file = "results/content/recall/%s-%s-%.2f-%d" % \
88   - (params['strategy'],params['weight'],params['sample'],n)
89   - output = open(recall_file,'w')
90   - output.write("# weight=%s\n" % params['weight'])
91   - output.write("# strategy=%s\n" % params['strategy'])
92   - output.write("# sample=%f\n" % params['sample'])
93   - output.write("\n%d %d %d\n" % \
94   - (self.repo_size,len(item_score),self.sample_size))
95   - notfound = []
96   - ranks = []
97   - for pkg in sample.keys():
98   - if pkg in recommendation.ranking:
99   - ranks.append(recommendation.ranking.index(pkg))
100   - else:
101   - notfound.append(pkg)
102   - for r in sorted(ranks):
103   - output.write(str(r)+"\n")
104   - if notfound:
105   - output.write("Out of recommendation:\n")
106   - for pkg in notfound:
107   - output.write(pkg+"\n")
108   - output.close()
109   - # Plot metrics summary
110   - accuracy = []
111   - precision = []
112   - recall = []
113   - f1 = []
114   - g = Gnuplot.Gnuplot()
115   - g('set style data lines')
116   - g.xlabel('Recommendation size')
117   - for size in range(1,len(recommendation.ranking)+1,100):
118   - predicted = RecommendationResult(dict.fromkeys(recommendation.ranking[:size],1))
119   - real = RecommendationResult(sample)
120   - evaluation = Evaluation(predicted,real,self.repo_size)
121   - accuracy.append([size,evaluation.run(Accuracy())])
122   - precision.append([size,evaluation.run(Precision())])
123   - recall.append([size,evaluation.run(Recall())])
124   - f1.append([size,evaluation.run(F1())])
125   - g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
126   - Gnuplot.Data(precision,title="Precision"),
127   - Gnuplot.Data(recall,title="Recall"),
128   - Gnuplot.Data(f1,title="F1"))
129   - g.hardcopy(recall_file+"-plot.ps", enhanced=1, color=1)
130   - # Iteration log
131   - result = {'iteration': n,
132   - 'weight': params['weight'],
133   - 'strategy': params['strategy'],
134   - 'accuracy': accuracy[20],
135   - 'precision': precision[20],
136   - 'recall:': recall[20],
137   - 'f1': f1[20]}
138   - return result
139   -
140   -#class CollaborativeSuite(expsuite.PyExperimentSuite):
141   -# def reset(self, params, rep):
142   -# if params['name'].startswith("collaborative"):
143   -#
144   -# def iterate(self, params, rep, n):
145   -# if params['name'].startswith("collaborative"):
146   -# for root, dirs, files in os.walk(self.source_dir):
147   -# for popcon_file in files:
148   -# submission = PopconSubmission(os.path.join(root,popcon_file))
149   -# user = User(submission.packages)
150   -# user.maximal_pkg_profile()
151   -# rec.get_recommendation(user)
152   -# precision = 0
153   -# result = {'weight': params['weight'],
154   -# 'strategy': params['strategy'],
155   -# 'profile_size': self.profile_size[n],
156   -# 'accuracy': accuracy,
157   -# 'precision': precision,
158   -# 'recall:': recall,
159   -# 'f1': }
160   -# else:
161   -# result = {}
162   -# return result
163   -
164   -if __name__ == '__main__':
165   -
166   - if "clustering" in sys.argv or len(sys.argv)<3:
167   - ClusteringSuite().start()
168   - if "content" in sys.argv or len(sys.argv)<3:
169   - ContentBasedSuite().start()
170   - #if "collaborative" in sys.argv or len(sys.argv)<3:
171   - #CollaborativeSuite().start()
src/experiments/sample-popcon-arch.py 0 → 100755
... ... @@ -0,0 +1,44 @@
  1 +#! /usr/bin/env python
  2 +"""
  3 + sample-popcon-arch - extract a sample of a specific arch
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +import sys
  22 +sys.path.insert(0,'../')
  23 +import xapian
  24 +import os
  25 +import random
  26 +import sys
  27 +from user import RandomPopcon
  28 +
USAGE = "Usage: sample-popcon-arch size arch popcon_dir pkgs_filter"


def parse_args(argv):
    """
    Extract (size, arch, popcon_dir, pkgs_filter) from the command line.

    argv is the full argument vector, program name included. Extra
    trailing arguments are ignored. Raises ValueError when an argument
    is missing or when size is not an integer.
    """
    if len(argv) < 5:
        raise ValueError("expected 4 arguments, got %d" % (len(argv) - 1))
    return int(argv[1]), argv[2], argv[3], argv[4]


def main(argv):
    """
    Write the ids of `size` random popcon users of a given arch to
    results/misc-popcon/sample-<arch>-<size>, one id per line.
    """
    try:
        size, arch, popcon_dir, pkgs_filter = parse_args(argv)
    except ValueError:
        # Only malformed arguments land here; Ctrl-C and SystemExit are
        # no longer swallowed as they were by the bare except.
        sys.exit(USAGE)

    sample_file = ("results/misc-popcon/sample-%s-%d" % (arch, size))
    # Create the output directory up front instead of failing on open().
    sample_dir = os.path.dirname(sample_file)
    if not os.path.isdir(sample_dir):
        os.makedirs(sample_dir)

    with open(sample_file, 'w') as f:
        for n in range(1, size + 1):
            user = RandomPopcon(popcon_dir, arch, pkgs_filter)
            f.write(user.user_id + '\n')
            # Single-argument form prints the same text on Python 2 and 3.
            print("sample %d" % n)


if __name__ == '__main__':
    main(sys.argv)
... ...
src/experiments/strategies-suite.py
... ... @@ -1,274 +0,0 @@
1   -#!/usr/bin/env python
2   -"""
3   - recommender suite - recommender experiments suite
4   -"""
5   -__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
6   -__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
7   -__license__ = """
8   - This program is free software: you can redistribute it and/or modify
9   - it under the terms of the GNU General Public License as published by
10   - the Free Software Foundation, either version 3 of the License, or
11   - (at your option) any later version.
12   -
13   - This program is distributed in the hope that it will be useful,
14   - but WITHOUT ANY WARRANTY; without even the implied warranty of
15   - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16   - GNU General Public License for more details.
17   -
18   - You should have received a copy of the GNU General Public License
19   - along with this program. If not, see <http://www.gnu.org/licenses/>.
20   -"""
21   -
22   -import sys
23   -sys.path.insert(0,'../')
24   -from config import Config
25   -from data import PopconXapianIndex, PopconSubmission, AppAptXapianIndex
26   -from recommender import Recommender
27   -from user import LocalSystem, User
28   -from evaluation import *
29   -import logging
30   -import random
31   -import Gnuplot
32   -
33   -#iterations = 3
34   -#sample_proportions = [0.9]
35   -#weighting = [('bm25',1.2)]
36   -#collaborative = ['knn']
37   -#content_based = []
38   -#hybrid = ['knnco']
39   -#profile_size = [50,100]
40   -#popcon_size = ["1000"]
41   -#neighbors = [50]
42   -
43   -iterations = 10
44   -sample_proportions = [0.5, 0.6, 0.7, 0.8, 0.9]
45   -weighting = [('bm25',1.2), ('bm25',1.6), ('bm25',2.0), ('trad',0)]
46   -content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
47   -collaborative = ['knn_eset','knn','knn_plus']
48   -hybrid = ['knnco','knnco_eset']
49   -
50   -profile_size = range(20,100,20)
51   -#popcon_size = [1000,10000,50000,'full']
52   -neighbors = range(10,510,50)
53   -
54   -def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
55   - # Write recall log
56   - output = open(("%s-%d" % (log_file,n)),'w')
57   - output.write("# %s-n\n" % label["description"])
58   - output.write("# %s-%d\n" % (label["values"],n))
59   - output.write("\n%d %d %d\n" % \
60   - (repo_size,profile_size,len(sample)))
61   - if hasattr(recommendation,"ranking"):
62   - notfound = []
63   - ranks = []
64   - for pkg in sample.keys():
65   - if pkg in recommendation.ranking:
66   - ranks.append(recommendation.ranking.index(pkg))
67   - else:
68   - notfound.append(pkg)
69   - for r in sorted(ranks):
70   - output.write(str(r)+"\n")
71   - if notfound:
72   - output.write("Out of recommendation:\n")
73   - for pkg in notfound:
74   - output.write(pkg+"\n")
75   - output.close()
76   -
77   -def plot_summary(precision,recall,f1,f05,accuracy,log_file):
78   - # Plot metrics summary
79   - g = Gnuplot.Gnuplot()
80   - g('set style data lines')
81   - g.xlabel('Recommendation size')
82   - g.title("Setup: %s" % log_file.split("/")[-1])
83   - g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
84   - Gnuplot.Data(precision,title="Precision"),
85   - Gnuplot.Data(recall,title="Recall"),
86   - Gnuplot.Data(f1,title="F_1"),
87   - Gnuplot.Data(f05,title="F_0.5"))
88   - g.hardcopy(log_file+".png",terminal="png")
89   - g.hardcopy(log_file+".ps",terminal="postscript",enhanced=1,color=1)
90   - g('set logscale x')
91   - g('replot')
92   - g.hardcopy(log_file+"-logscale.png",terminal="png")
93   - g.hardcopy(log_file+"-logscale.ps",terminal="postscript",enhanced=1,color=1)
94   -
95   -def get_label(cfg,sample_proportion):
96   - label = {}
97   - if cfg.strategy in content_based:
98   - label["description"] = "strategy-filter-profile-k1_bm25-sample"
99   - label["values"] = ("%s-profile%d-%s-kbm%.1f-sample%.1f" %
100   - (cfg.strategy,cfg.profile_size,
101   - cfg.pkgs_filter.split("/")[-1],
102   - cfg.bm25_k1,sample_proportion))
103   - elif cfg.strategy in collaborative:
104   - label["description"] = "strategy-knn-filter-k1_bm25-sample"
105   - label["values"] = ("%s-k%d-%s-kbm%.1f-sample%.1f" %
106   - (cfg.strategy,cfg.k_neighbors,
107   - cfg.pkgs_filter.split("/")[-1],
108   - cfg.bm25_k1,sample_proportion))
109   - elif cfg.strategy in hybrid:
110   - label["description"] = "strategy-knn-filter-profile-k1_bm25-sample"
111   - label["values"] = ("%s-k%d-profile%d-%s-kbm%.1f-sample%.1f" %
112   - (cfg.strategy,cfg.k_neighbors,cfg.profile_size,
113   - cfg.pkgs_filter.split("/")[-1],
114   - cfg.bm25_k1,sample_proportion))
115   - else:
116   - print "Unknown strategy"
117   - return label
118   -
119   -class ExperimentResults:
120   - def __init__(self,repo_size):
121   - self.repository_size = repo_size
122   - self.accuracy = {}
123   - self.precision = {}
124   - self.recall = {}
125   - self.f1 = {}
126   - self.f05 = {}
127   - points = [1]+range(10,200,10)+range(200,self.repository_size,100)
128   - for size in points:
129   - self.accuracy[size] = []
130   - self.precision[size] = []
131   - self.recall[size] = []
132   - self.f1[size] = []
133   - self.f05[size] = []
134   -
135   - def add_result(self,ranking,sample):
136   - for size in self.accuracy.keys():
137   - predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
138   - real = RecommendationResult(sample)
139   - evaluation = Evaluation(predicted,real,self.repository_size)
140   - self.accuracy[size].append(evaluation.run(Accuracy()))
141   - self.precision[size].append(evaluation.run(Precision()))
142   - self.recall[size].append(evaluation.run(Recall()))
143   - self.f1[size].append(evaluation.run(F_score(1)))
144   - self.f05[size].append(evaluation.run(F_score(0.5)))
145   -
146   - def get_precision_summary(self):
147   - summary = [[size,sum(values)/len(values)] for size,values in self.precision.items()]
148   - return sorted(summary)
149   -
150   - def get_recall_summary(self):
151   - summary = [[size,sum(values)/len(values)] for size,values in self.recall.items()]
152   - return sorted(summary)
153   -
154   - def get_f1_summary(self):
155   - summary = [[size,sum(values)/len(values)] for size,values in self.f1.items()]
156   - return sorted(summary)
157   -
158   - def get_f05_summary(self):
159   - summary = [[size,sum(values)/len(values)] for size,values in self.f05.items()]
160   - return sorted(summary)
161   -
162   - def get_accuracy_summary(self):
163   - summary = [[size,sum(values)/len(values)] for size,values in self.accuracy.items()]
164   - return sorted(summary)
165   -
166   - def best_precision(self):
167   - size = max(self.precision, key = lambda x: max(self.precision[x]))
168   - return (size,max(self.precision[size]))
169   -
170   - def best_f1(self):
171   - size = max(self.f1, key = lambda x: max(self.f1[x]))
172   - return (size,max(self.f1[size]))
173   -
174   - def best_f05(self):
175   - size = max(self.f05, key = lambda x: max(self.f05[x]))
176   - return (size,max(self.f05[size]))
177   -
178   -def run_strategy(cfg,user):
179   - for weight in weighting:
180   - cfg.weight = weight[0]
181   - cfg.bm25_k1 = weight[1]
182   - rec = Recommender(cfg)
183   - repo_size = rec.items_repository.get_doccount()
184   - for proportion in sample_proportions:
185   - results = ExperimentResults(repo_size)
186   - label = get_label(cfg,proportion)
187   - log_file = "results/strategies/"+label["values"]
188   - for n in range(iterations):
189   - # Fill sample profile
190   - profile_size = len(user.pkg_profile)
191   - item_score = {}
192   - for pkg in user.pkg_profile:
193   - item_score[pkg] = user.item_score[pkg]
194   - sample = {}
195   - sample_size = int(profile_size*proportion)
196   - for i in range(sample_size):
197   - key = random.choice(item_score.keys())
198   - sample[key] = item_score.pop(key)
199   - iteration_user = User(item_score)
200   - recommendation = rec.get_recommendation(iteration_user,repo_size)
201   - write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file)
202   - if hasattr(recommendation,"ranking"):
203   - results.add_result(recommendation.ranking,sample)
204   - with open(log_file,'w') as f:
205   - precision_10 = sum(results.precision[10])/len(results.precision[10])
206   - f1_10 = sum(results.f1[10])/len(results.f1[10])
207   - f05_10 = sum(results.f05[10])/len(results.f05[10])
208   - f.write("# %s\n# %s\n\ncoverage %d\n\n" %
209   - (label["description"],label["values"],recommendation.size))
210   - f.write("# best results (recommendation size; metric)\n")
211   - f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\nf05 (%d; %.2f)\n\n" %
212   - (results.best_precision()[0],results.best_precision()[1],
213   - results.best_f1()[0],results.best_f1()[1],
214   - results.best_f05()[0],results.best_f05()[1]))
215   - f.write("# recommendation size 10\nprecision (10; %.2f)\nf1 (10; %.2f)\nf05 (10; %.2f)" %
216   - (precision_10,f1_10,f05_10))
217   - precision = results.get_precision_summary()
218   - recall = results.get_recall_summary()
219   - f1 = results.get_f1_summary()
220   - f05 = results.get_f05_summary()
221   - accuracy = results.get_accuracy_summary()
222   - plot_summary(precision,recall,f1,f05,accuracy,log_file)
223   -
224   -def run_content(user,cfg):
225   - for strategy in content_based:
226   - cfg.strategy = strategy
227   - for size in profile_size:
228   - cfg.profile_size = size
229   - run_strategy(cfg,user)
230   -
231   -def run_collaborative(user,cfg):
232   - popcon_desktopapps = cfg.popcon_desktopapps
233   - popcon_programs = cfg.popcon_programs
234   - for strategy in collaborative:
235   - cfg.strategy = strategy
236   - for k in neighbors:
237   - cfg.k_neighbors = k
238   - #for size in popcon_size:
239   - # if size:
240   - # cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
241   - # cfg.popcon_programs = popcon_programs+"_"+size
242   - run_strategy(cfg,user)
243   -
244   -def run_hybrid(user,cfg):
245   - popcon_desktopapps = cfg.popcon_desktopapps
246   - popcon_programs = cfg.popcon_programs
247   - for strategy in hybrid:
248   - cfg.strategy = strategy
249   - for k in neighbors:
250   - cfg.k_neighbors = k
251   - #for size in popcon_size:
252   - # if size:
253   - # cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
254   - # cfg.popcon_programs = popcon_programs+"_"+size
255   - for size in profile_size:
256   - cfg.profile_size = size
257   - run_strategy(cfg,user)
258   -
259   -if __name__ == '__main__':
260   - #user = LocalSystem()
261   - #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
262   -
263   - cfg = Config()
264   - user = PopconSystem("/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7")
265   - #user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
266   - user.filter_pkg_profile(cfg.pkgs_filter)
267   - user.maximal_pkg_profile()
268   -
269   - if "content" in sys.argv or len(sys.argv)<2:
270   - run_content(user,cfg)
271   - if "collaborative" in sys.argv or len(sys.argv)<2:
272   - run_collaborative(user,cfg)
273   - if "hybrid" in sys.argv or len(sys.argv)<2:
274   - run_hybrid(user,cfg)
src/user.py
... ... @@ -111,7 +111,7 @@ class User:
111 111 """
112 112 Define a user of a recommender.
113 113 """
114   - def __init__(self,item_score,user_id=0,demo_profiles_set=0):
  114 + def __init__(self,item_score,user_id=0,arch=0,demo_profiles_set=0):
115 115 """
116 116 Set initial user attributes. pkg_profile gets the whole set of items,
117 117 a random user_id is set if none was provided and the demographic
... ... @@ -119,6 +119,7 @@ class User:
119 119 """
120 120 self.item_score = item_score
121 121 self.pkg_profile = self.items()
  122 + self.arch = arch
122 123  
123 124 if user_id:
124 125 self.user_id = user_id
... ... @@ -272,21 +273,28 @@ class User:
272 273 return self.pkg_profile
273 274  
class RandomPopcon(User):
    """
    A user built from a randomly chosen popcon submission.
    """
    def __init__(self,submissions_dir,arch=0,pkgs_filter=0):
        """
        Pick a random submission under submissions_dir and load it as user.

        Candidates are tried in random order until one is found whose
        profile holds at least 100 packages (counted after applying
        pkgs_filter, when given) and whose architecture equals arch, when
        given. A false arch or pkgs_filter (the default 0) disables the
        corresponding check. The filter is used for selection only: the
        resulting user is initialized with the submission's full package
        set.

        NOTE(review): arch sits before pkgs_filter in the signature, so a
        caller passing the filter as second positional argument would set
        arch instead -- confirm the call sites.

        Raises LookupError if no submission satisfies the criteria.
        """
        # Walk the tree once; it does not change between attempts.
        candidates = [os.path.join(root, submission) for
                      root, dirs, files in os.walk(submissions_dir)
                      for submission in files]
        # A shuffled pass keeps the choice uniform among qualifying
        # submissions while guaranteeing termination.
        random.shuffle(candidates)
        for path in candidates:
            user = PopconSystem(path)
            # The arch test applies to this candidate only; it is skipped
            # altogether when no arch was requested.
            if arch and user.arch != arch:
                continue
            if pkgs_filter:
                user.filter_pkg_profile(pkgs_filter)
            if len(user.pkg_profile) >= 100:
                break
        else:
            raise LookupError("no popcon submission under %s matches the "
                              "requested arch/profile size" % submissions_dir)
        submission = data.PopconSubmission(path)
        User.__init__(self,submission.packages,submission.user_id,submission.arch)
290 298  
291 299 class PopconSystem(User):
292 300 def __init__(self,path,user_id=0):
... ... @@ -296,7 +304,7 @@ class PopconSystem(User):
296 304 submission = data.PopconSubmission(path)
297 305 if not user_id:
298 306 user_id = submission.user_id
299   - User.__init__(self,submission.packages,user_id)
  307 + User.__init__(self,submission.packages,user_id,submission.arch)
300 308  
301 309 class PkgsListSystem(User):
302 310 def __init__(self,pkgs_list_or_file,user_id=0):
... ...