Commit 6d9bfe1d7e44ab36152b3b97fd12208d56f27dfb
Exists in
master
and in
1 other branch
Merge branch 'master' of github.com:tassia/AppRecommender
Showing
28 changed files
with
1920 additions
and
62 deletions
Show diff stats
src/bin/cross_validation.py
... | ... | @@ -37,7 +37,7 @@ if __name__ == '__main__': |
37 | 37 | #user = LocalSystem() |
38 | 38 | #user = RandomPopcon(cfg.popcon_dir) |
39 | 39 | #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps")) |
40 | - user = PopconSystem("/home/tassia/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623") | |
40 | + user = PopconSystem(os.path.expanduser("~/.app-recommender/popcon-entries/00/0001166d0737c6dffb083071e5ee69f5")) | |
41 | 41 | user.filter_pkg_profile(os.path.join(cfg.filters_dir,"desktopapps")) |
42 | 42 | user.maximal_pkg_profile() |
43 | 43 | begin_time = datetime.datetime.now() |
... | ... | @@ -48,7 +48,7 @@ if __name__ == '__main__': |
48 | 48 | metrics.append(F_score(0.5)) |
49 | 49 | metrics.append(Accuracy()) |
50 | 50 | metrics.append(FPR()) |
51 | - validation = CrossValidation(0.9,10,rec,metrics,1) | |
51 | + validation = CrossValidation(0.9,20,rec,metrics,0.005) | |
52 | 52 | validation.run(user) |
53 | 53 | print validation |
54 | 54 | ... | ... |
... | ... | @@ -0,0 +1,42 @@ |
1 | +#!/usr/bin/env python | |
2 | +""" | |
3 | + AppRecommender - A GNU/Linux application recommender | |
4 | +""" | |
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | |
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | |
7 | +__license__ = """ | |
8 | + This program is free software: you can redistribute it and/or modify | |
9 | + it under the terms of the GNU General Public License as published by | |
10 | + the Free Software Foundation, either version 3 of the License, or | |
11 | + (at your option) any later version. | |
12 | + | |
13 | + This program is distributed in the hope that it will be useful, | |
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | + GNU General Public License for more details. | |
17 | + | |
18 | + You should have received a copy of the GNU General Public License | |
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | +""" | |
21 | + | |
22 | +import os | |
23 | +import sys | |
24 | +sys.path.insert(0,'../') | |
25 | +import xapian | |
26 | + | |
27 | +if __name__ == '__main__': | |
28 | + if len(sys.argv)<2: | |
29 | + print "Usage: get_axipkgs index_path" | |
30 | + exit(1) | |
31 | + | |
32 | + axi_path = sys.argv[1] | |
33 | + axi = xapian.Database(axi_path) | |
34 | + for n in range(1,axi.get_lastdocid()): | |
35 | + doc = 0 | |
36 | + try: | |
37 | + doc = axi.get_document(n) | |
38 | + except: | |
39 | + pass | |
40 | + if doc: | |
41 | + xp_terms = [t.term for t in doc.termlist() if t.term.startswith("XP")] | |
42 | + print xp_terms[0].lstrip('XP') | ... | ... |
src/bin/get_desktop.sh
1 | 1 | #!/usr/bin/env bash |
2 | 2 | # |
3 | -# get_desktop.sh - get packages which have desktop files | |
3 | +# get_desktop.sh - get packages which have desktop files | |
4 | +# | |
5 | +# DEPRECATED: use get_axipkgs.py to get this info from the apt-xapian-index (axi) |
4 | 6 | |
5 | 7 | cd /usr/share/app-install/desktop |
6 | 8 | sed -ne 's/X-AppInstall-Package=//p' * | sort -u | grep -v kdelibs | grep -v libfm-gtk0 | ... | ... |
src/bin/get_pkgs_inst.py
1 | 1 | #!/usr/bin/env python |
2 | 2 | # |
3 | 3 | # get_pkgs_inst.py - get tuple (package,installation) from popcon results file |
4 | +# | |
5 | +# results_file: org/popcon.debian.org/popcon-mail/results | |
4 | 6 | |
7 | +import sys | |
5 | 8 | from operator import itemgetter |
9 | + | |
6 | 10 | if __name__ == '__main__': |
11 | + if len(sys.argv)<2: | |
12 | + print "Usage: get_pkgs_inst popcon_results_path" | |
13 | + exit(1) | |
14 | + | |
15 | + results_path = sys.argv[1] | |
7 | 16 | pkgs_inst = {} |
8 | - with open("/root/org/popcon.debian.org/popcon-mail/results") as results: | |
17 | + with open(results_path) as results: | |
9 | 18 | for line in results: |
10 | 19 | if line.startswith("Package"): |
11 | 20 | fields = line.split() |
12 | 21 | inst = int(fields[2])+int(fields[3])+int(fields[4]) |
13 | - if inst > 20: | |
14 | - pkgs_inst[fields[1]] = inst | |
22 | + pkgs_inst[fields[1]] = inst | |
15 | 23 | sorted_by_inst = sorted(pkgs_inst.items(), key=itemgetter(1)) |
16 | 24 | for pkg, inst in sorted_by_inst: |
17 | 25 | print pkg, inst | ... | ... |
... | ... | @@ -0,0 +1,77 @@ |
1 | +#!/usr/bin/env python | |
2 | +""" | |
3 | + indexer.py - generate xapian indexes to be used as items and users | |
4 | + repositories | |
5 | +""" | |
6 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | |
7 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | |
8 | +__license__ = """ | |
9 | + This program is free software: you can redistribute it and/or modify | |
10 | + it under the terms of the GNU General Public License as published by | |
11 | + the Free Software Foundation, either version 3 of the License, or | |
12 | + (at your option) any later version. | |
13 | + | |
14 | + This program is distributed in the hope that it will be useful, | |
15 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 | + GNU General Public License for more details. | |
18 | + | |
19 | + You should have received a copy of the GNU General Public License | |
20 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | |
21 | +""" | |
22 | + | |
23 | +import os | |
24 | +import sys | |
25 | +sys.path.insert(0,'../') | |
26 | +import datetime | |
27 | + | |
28 | +from config import Config | |
29 | +from error import Error | |
30 | +import data | |
31 | +import xapian | |
32 | + | |
33 | +if __name__ == '__main__': | |
34 | + axi_path = "/var/lib/apt-xapian-index/index" | |
35 | + axi = xapian.Database(axi_path) | |
36 | + base_dir = os.path.expanduser("~/.app-recommender/") | |
37 | + | |
38 | + begin_time = datetime.datetime.now() | |
39 | + | |
40 | + # axi sample based on the pkgs sample provided by command line | |
41 | + if "sample" in sys.argv: | |
42 | + print ("Sample package indexing started at %s" % begin_time) | |
43 | + if len(sys.argv) > 2: | |
44 | + pkgs_filter = sys.argv[2] | |
45 | + else: | |
46 | + print "Usage: indexer axi_sample pkgs_sample_file" | |
47 | + exit(1) | |
48 | + with open(pkgs_filter) as valid: | |
49 | + pkgs_list = [line.strip() for line in valid] | |
50 | + filter_str = pkgs_filter.split("/")[-1] | |
51 | + index = data.SampleAptXapianIndex(pkgs_list,axi, | |
52 | + os.path.join(base_dir,"axi_"+filter_str)) | |
53 | + print ("Axi size: %d" % axi.get_doccount()) | |
54 | + print ("Packages list length: %d" % len(pkgs_list)) | |
55 | + print ("Sample index size: %d" % | |
56 | + index.get_doccount()) | |
57 | + | |
58 | + # axi filtered by terms provided by command line | |
59 | + if "filter" in sys.argv: | |
60 | + print ("Filtered package indexing started at %s" % begin_time) | |
61 | + if len(sys.argv) > 2: | |
62 | + terms = sys.argv[2:] | |
63 | + else: | |
64 | + print ("Usage: indexer axi_filter term [additional terms]") | |
65 | + exit(1) | |
66 | + terms_str = "_".join([t.split("::")[-1] for t in terms]) | |
67 | + index = data.FilteredXapianIndex(terms,axi, | |
68 | + os.path.join(base_dir,"axi_"+terms_str)) | |
69 | + print ("Axi size: %d" % axi.get_doccount()) | |
70 | + print ("Terms filter: %s" % terms) | |
71 | + print ("Filtered index size: %d" % | |
72 | + index.get_doccount()) | |
73 | + | |
74 | + end_time = datetime.datetime.now() | |
75 | + print ("Indexing completed at %s" % end_time) | |
76 | + delta = end_time - begin_time | |
77 | + print ("Time elapsed: %d seconds." % delta.seconds) | ... | ... |
... | ... | @@ -0,0 +1,52 @@ |
1 | +#!/usr/bin/env python | |
2 | +""" | |
3 | + popindex.py - generate a popcon index to be used by the recommender as the | |
4 | + users repository, based on filters provided by config | |
5 | +""" | |
6 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | |
7 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | |
8 | +__license__ = """ | |
9 | + This program is free software: you can redistribute it and/or modify | |
10 | + it under the terms of the GNU General Public License as published by | |
11 | + the Free Software Foundation, either version 3 of the License, or | |
12 | + (at your option) any later version. | |
13 | + | |
14 | + This program is distributed in the hope that it will be useful, | |
15 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 | + GNU General Public License for more details. | |
18 | + | |
19 | + You should have received a copy of the GNU General Public License | |
20 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | |
21 | +""" | |
22 | +import os | |
23 | +import sys | |
24 | +sys.path.insert(0,'../') | |
25 | +import logging | |
26 | +import datetime | |
27 | + | |
28 | +from config import Config | |
29 | +from data import FilteredPopconXapianIndex | |
30 | + | |
31 | +if __name__ == '__main__': | |
32 | + base_dir = os.path.expanduser("~/.app-recommender/") | |
33 | + axi_path = os.path.join(base_dir,"axi_XD") | |
34 | + path = os.path.join(base_dir,"popcon_XD") | |
35 | + popcon_dir = os.path.join(base_dir,"popcon-entries") | |
36 | + tags_filter = os.path.join(base_dir,"filters/debtags") | |
37 | + | |
38 | + # set up config for logging | |
39 | + cfg = Config() | |
40 | + | |
41 | + begin_time = datetime.datetime.now() | |
42 | + logging.info("Popcon indexing started at %s" % begin_time) | |
43 | + # use config file or command line options | |
44 | + index = FilteredPopconXapianIndex(path,popcon_dir,axi_path,tags_filter) | |
45 | + | |
46 | + end_time = datetime.datetime.now() | |
47 | + logging.info("Popcon indexing completed at %s" % end_time) | |
48 | + logging.info("Number of documents (submissions): %d" % | |
49 | + index.get_doccount()) | |
50 | + | |
51 | + delta = end_time - begin_time | |
52 | + logging.info("Time elapsed: %d seconds." % delta.seconds) | ... | ... |
src/config.py
... | ... | @@ -40,7 +40,7 @@ class Config(Singleton): |
40 | 40 | ## general options |
41 | 41 | self.debug = 0 |
42 | 42 | self.verbose = 1 |
43 | - self.output = "log" | |
43 | + self.output = "apprec.log" | |
44 | 44 | |
45 | 45 | ## data_source options |
46 | 46 | self.base_dir = os.path.expanduser("~/.app-recommender/") |
... | ... | @@ -103,13 +103,14 @@ class Config(Singleton): |
103 | 103 | print " -f, --filtersdir=PATH Path to filters directory" |
104 | 104 | print " -b, --pkgsfilter=FILTER File containing packages to be considered for recommendations" |
105 | 105 | print " -a, --axi=PATH Path to apt-xapian-index" |
106 | - print " -e, --dde=URL DDE url" | |
107 | 106 | print " -p, --popconindex=PATH Path to popcon index" |
108 | - print " -m, --popcondir=PATH Path to popcon submissions dir" | |
109 | - print " -u, --indexmode=MODE 'old'|'reindex'|'cluster'|'recluster'" | |
110 | - print " -l, --clustersdir=PATH Path to popcon clusters dir" | |
111 | - print " -c, --medoids=k Number of medoids for clustering" | |
112 | - print " -x, --maxpopcon=k Number of submissions to be considered" | |
107 | + print " -e, --dde=URL DDE url" | |
108 | + # deprecated options | |
109 | + #print " -m, --popcondir=PATH Path to popcon submissions dir" | |
110 | + #print " -u, --indexmode=MODE 'old'|'reindex'|'cluster'|'recluster'" | |
111 | + #print " -l, --clustersdir=PATH Path to popcon clusters dir" | |
112 | + #print " -c, --medoids=k Number of medoids for clustering" | |
113 | + #print " -x, --maxpopcon=k Number of submissions to be considered" | |
113 | 114 | print "" |
114 | 115 | print " [ recommender ]" |
115 | 116 | print " -w, --weight=OPTION Search weighting scheme" |
... | ... | @@ -123,11 +124,19 @@ class Config(Singleton): |
123 | 124 | print " bm25 = bm25 weighting scheme" |
124 | 125 | print "" |
125 | 126 | print " [ strategy options ] " |
126 | - print " cb = content-based " | |
127 | - print " cbt = content-based using only tags as content " | |
128 | - print " cbd = content-based using only package descriptions as content " | |
129 | - print " col = collaborative " | |
130 | - print " colct = collaborative through tags content " | |
127 | + print " cb = content-based, mixed profile" | |
128 | + print " cbt = content-based, tags only profile" | |
129 | + print " cbd = content-based, description terms only profile" | |
130 | + print " cbh = content-based, half-half profile" | |
131 | + print " cb_eset = cb with eset profiling" | |
132 | + print " cbt_eset = cbt with eset profiling" | |
133 | + print " cbd_eset = cbd_eset with eset profiling" | |
134 | + print " cbh_eset = cbh with eset profiling" | |
135 | + print " knn = collaborative, tf-idf knn" | |
136 | + print " knn_plus = collaborative, tf-idf weighted knn" | |
137 | + print " knn_eset = collaborative, eset knn" | |
138 | + print " knnco = collaborative through content" | |
139 | + print " knnco_eset = collaborative through content, eset recommendation" | |
131 | 140 | |
132 | 141 | def read_option(self, section, option): |
133 | 142 | """ | ... | ... |
src/data.py
... | ... | @@ -30,12 +30,26 @@ import shutil |
30 | 30 | import apt |
31 | 31 | import re |
32 | 32 | import operator |
33 | +import urllib | |
34 | +import simplejson as json | |
33 | 35 | |
34 | 36 | from error import Error |
35 | 37 | from singleton import Singleton |
36 | 38 | from dissimilarity import * |
37 | 39 | from config import Config |
38 | 40 | |
41 | +def axi_get_pkgs(axi): | |
42 | + pkgs_names = [] | |
43 | + for docid in range(1,axi.get_lastdocid()+1): | |
44 | + try: | |
45 | + doc = axi.get_document(docid) | |
46 | + except: | |
47 | + pass | |
48 | + docterms_XP = [t.term for t in doc.termlist() | |
49 | + if t.term.startswith("XP")] | |
50 | + pkgs_names.append(docterms_XP[0].lstrip('XP')) | |
51 | + return pkgs_names | |
52 | + | |
39 | 53 | def axi_search_pkgs(axi,pkgs_list): |
40 | 54 | terms = ["XP"+item for item in pkgs_list] |
41 | 55 | query = xapian.Query(xapian.Query.OP_OR, terms) |
... | ... | @@ -110,30 +124,39 @@ def tfidf_plus(index,docs,content_filter): |
110 | 124 | variance = sum([(p-mean)*(p-mean) for p in population])/len(population) |
111 | 125 | standard_deviation = math.sqrt(variance) |
112 | 126 | for d in docs: |
113 | - normalized_weigths[d.docid] = d.weight/standard_deviation | |
127 | + if standard_deviation>1: | |
128 | + # dividing by a standard deviation in [0,1] would inflate the weights instead of normalizing them | |
129 | + normalized_weigths[d.docid] = d.weight/standard_deviation | |
130 | + else: | |
131 | + normalized_weigths[d.docid] = d.weight | |
114 | 132 | return tfidf_weighting(index,docs,content_filter,normalized_weigths) |
115 | 133 | |
116 | -class AppAptXapianIndex(xapian.WritableDatabase): | |
134 | +class FilteredXapianIndex(xapian.WritableDatabase): | |
117 | 135 | """ |
118 | - Data source for application packages information | |
136 | + Filtered Xapian Index | |
119 | 137 | """ |
120 | - def __init__(self,axi_path,path): | |
138 | + def __init__(self,terms,index_path,path): | |
121 | 139 | xapian.WritableDatabase.__init__(self,path, |
122 | 140 | xapian.DB_CREATE_OR_OVERWRITE) |
123 | - axi = xapian.Database(axi_path) | |
124 | - logging.info("AptXapianIndex size: %d" % axi.get_doccount()) | |
125 | - for docid in range(1,axi.get_lastdocid()+1): | |
141 | + index = xapian.Database(index_path) | |
142 | + for docid in range(1,index.get_lastdocid()+1): | |
126 | 143 | try: |
127 | - doc = axi.get_document(docid) | |
128 | - allterms = [term.term for term in doc.termlist()] | |
129 | - if "XTrole::program" in allterms: | |
144 | + doc = index.get_document(docid) | |
145 | + docterms = [term.term for term in doc.termlist()] | |
146 | + tagged = False | |
147 | + for t in terms: | |
148 | + if t in docterms: | |
149 | + tagged = True | |
150 | + if tagged: | |
130 | 151 | self.add_document(doc) |
131 | 152 | logging.info("Added doc %d." % docid) |
132 | 153 | else: |
133 | 154 | logging.info("Discarded doc %d." % docid) |
134 | 155 | except: |
135 | 156 | logging.info("Doc %d not found in axi." % docid) |
136 | - logging.info("AppAptXapianIndex size: %d (lastdocid: %d)." % | |
157 | + logging.info("Filter: %s" % terms) | |
158 | + logging.info("Index size: %d" % index.get_doccount()) | |
159 | + logging.info("Filtered Index size: %d (lastdocid: %d)." % | |
137 | 160 | (self.get_doccount(), self.get_lastdocid())) |
138 | 161 | |
139 | 162 | def __str__(self): |
... | ... | @@ -186,13 +209,13 @@ class DebianPackage(): |
186 | 209 | if pkg_version.record.has_key('Conflicts'): |
187 | 210 | self.conflicts = pkg_version.record['Conflicts'] |
188 | 211 | if pkg_version.record.has_key('Replaces'): |
189 | - self.conflicts = pkg_version.record['Replaces'] | |
212 | + self.replaces = pkg_version.record['Replaces'] | |
190 | 213 | if pkg_version.record.has_key('Provides'): |
191 | 214 | self.provides = pkg_version.record['Provides'] |
192 | 215 | |
193 | 216 | def load_details_from_dde(self,dde_server,dde_port): |
194 | - json_data = json.load(urllib.urlopen("http://%s:%s/q/udd/packages/all/%s?t=json" | |
195 | - % dde_server,dde_port,self.name)) | |
217 | + json_data = json.load(urllib.urlopen("http://%s:%d/q/udd/packages/prio-debian-sid/%s?t=json" | |
218 | + % (dde_server,dde_port,self.name))) | |
196 | 219 | |
197 | 220 | self.maintainer = json_data['r']['maintainer'] |
198 | 221 | self.version = json_data['r']['version'] |
... | ... | @@ -200,27 +223,27 @@ class DebianPackage(): |
200 | 223 | self.description = self.format_description(json_data['r']['long_description']) |
201 | 224 | self.section = json_data['r']['section'] |
202 | 225 | if json_data['r']['homepage']: |
203 | - self.conflicts = json_data['r']['homepage'] | |
226 | + self.homepage = json_data['r']['homepage'] | |
204 | 227 | if json_data['r']['tag']: |
205 | 228 | self.tags = self.debtags_list_to_dict(json_data['r']['tag']) |
206 | 229 | if json_data['r']['depends']: |
207 | 230 | self.depends = json_data['r']['depends'] |
208 | 231 | if json_data['r']['pre_depends']: |
209 | - self.conflicts = json_data['r']['pre_depends'] | |
232 | + self.predepends = json_data['r']['pre_depends'] | |
210 | 233 | if json_data['r']['recommends']: |
211 | - self.conflicts = json_data['r']['recommends'] | |
234 | + self.recommends = json_data['r']['recommends'] | |
212 | 235 | if json_data['r']['suggests']: |
213 | - self.conflicts = json_data['r']['suggests'] | |
236 | + self.suggests = json_data['r']['suggests'] | |
214 | 237 | if json_data['r']['conflicts']: |
215 | 238 | self.conflicts = json_data['r']['conflicts'] |
216 | 239 | if json_data['r']['replaces']: |
217 | - self.conflicts = json_data['r']['replaces'] | |
240 | + self.replaces = json_data['r']['replaces'] | |
218 | 241 | if json_data['r']['provides']: |
219 | - self.conflicts = json_data['r']['provides'] | |
242 | + self.provides = json_data['r']['provides'] | |
220 | 243 | self.popcon_insts = json_data['r']['popcon']['insts'] |
221 | 244 | |
222 | 245 | def format_description(self,description): |
223 | - return description.replace('.\n','').replace('\n','<br />') | |
246 | + return description.replace(' .\n','<br />').replace('\n','<br />') | |
224 | 247 | |
225 | 248 | def debtags_str_to_dict(self, debtags_str): |
226 | 249 | debtags_list = [tag.rstrip(",") for tag in debtags_str.split()] |
... | ... | @@ -281,6 +304,7 @@ class PopconSubmission(): |
281 | 304 | for line in submission: |
282 | 305 | if line.startswith("POPULARITY"): |
283 | 306 | self.user_id = line.split()[2].lstrip("ID:") |
307 | + self.arch = line.split()[3].lstrip("ARCH:") | |
284 | 308 | elif not line.startswith("END-POPULARITY"): |
285 | 309 | data = line.rstrip('\n').split() |
286 | 310 | if len(data) > 2: |
... | ... | @@ -304,6 +328,82 @@ class PopconSubmission(): |
304 | 328 | elif data[4] == '<RECENT-CTIME>': |
305 | 329 | self.packages[pkg] = 8 |
306 | 330 | |
331 | +class FilteredPopconXapianIndex(xapian.WritableDatabase): | |
332 | + """ | |
333 | + Data source for popcon submissions defined as a xapian database. | |
334 | + """ | |
335 | + def __init__(self,path,popcon_dir,axi_path,tags_filter): | |
336 | + """ | |
337 | + Set initial attributes. | |
338 | + """ | |
339 | + self.axi = xapian.Database(axi_path) | |
340 | + self.path = os.path.expanduser(path) | |
341 | + self.popcon_dir = os.path.expanduser(popcon_dir) | |
342 | + self.valid_pkgs = axi_get_pkgs(self.axi) | |
343 | + logging.debug("Considering %d valid packages" % len(self.valid_pkgs)) | |
344 | + with open(tags_filter) as valid_tags: | |
345 | + self.valid_tags = [line.strip() for line in valid_tags | |
346 | + if not line.startswith("#")] | |
347 | + logging.debug("Considering %d valid tags" % len(self.valid_tags)) | |
348 | + if not os.path.exists(self.popcon_dir): | |
349 | + os.makedirs(self.popcon_dir) | |
350 | + if not os.listdir(self.popcon_dir): | |
351 | + logging.critical("Popcon dir seems to be empty.") | |
352 | + raise Error | |
353 | + | |
354 | + # set up directory | |
355 | + shutil.rmtree(self.path,1) | |
356 | + os.makedirs(self.path) | |
357 | + try: | |
358 | + logging.info("Indexing popcon submissions from \'%s\'" % | |
359 | + self.popcon_dir) | |
360 | + logging.info("Creating new xapian index at \'%s\'" % | |
361 | + self.path) | |
362 | + xapian.WritableDatabase.__init__(self,self.path, | |
363 | + xapian.DB_CREATE_OR_OVERWRITE) | |
364 | + except xapian.DatabaseError as e: | |
365 | + logging.critical("Could not create popcon xapian index.") | |
366 | + logging.critical(str(e)) | |
367 | + raise Error | |
368 | + | |
369 | + # build new index | |
370 | + doc_count = 0 | |
371 | + for root, dirs, files in os.walk(self.popcon_dir): | |
372 | + for popcon_file in files: | |
373 | + submission = PopconSubmission(os.path.join(root, popcon_file)) | |
374 | + doc = xapian.Document() | |
375 | + submission_pkgs = submission.get_filtered(self.valid_pkgs) | |
376 | + if len(submission_pkgs) < 10: | |
377 | + logging.debug("Low profile popcon submission \'%s\' (%d)" % | |
378 | + (submission.user_id,len(submission_pkgs))) | |
379 | + else: | |
380 | + doc.set_data(submission.user_id) | |
381 | + doc.add_term("ID"+submission.user_id) | |
382 | + doc.add_term("ARCH"+submission.arch) | |
383 | + logging.debug("Parsing popcon submission \'%s\'" % | |
384 | + submission.user_id) | |
385 | + for pkg,freq in submission_pkgs.items(): | |
386 | + tags = axi_search_pkg_tags(self.axi,pkg) | |
387 | + # if the package was found in axi | |
388 | + if tags: | |
389 | + doc.add_term("XP"+pkg,freq) | |
390 | + # if the package has tags associated with it | |
391 | + if not tags == "notags": | |
392 | + for tag in tags: | |
393 | + if tag.lstrip("XT") in self.valid_tags: | |
394 | + doc.add_term(tag,freq) | |
395 | + doc_id = self.add_document(doc) | |
396 | + doc_count += 1 | |
397 | + logging.debug("Popcon Xapian: Indexing doc %d" % doc_id) | |
398 | + # python garbage collector | |
399 | + gc.collect() | |
400 | + # flush to disk database changes | |
401 | + try: | |
402 | + self.commit() | |
403 | + except: | |
404 | + self.flush() # deprecated; kept as a fallback for compatibility with older xapian library versions | |
405 | + | |
406 | +# Deprecated class, must be reviewed | |
307 | 407 | class PopconXapianIndex(xapian.WritableDatabase): |
308 | 408 | """ |
309 | 409 | Data source for popcon submissions defined as a singleton xapian database. | ... | ... |
src/evaluation.py
... | ... | @@ -140,6 +140,29 @@ class FPR(Metric): |
140 | 140 | return (float(len(evaluation.false_positive))/ |
141 | 141 | evaluation.real_negative_len) |
142 | 142 | |
143 | +class MCC(Metric): | |
144 | + """ | |
145 | + Matthews correlation coefficient. | |
146 | + """ | |
147 | + def __init__(self): | |
148 | + """ | |
149 | + Set metric description. | |
150 | + """ | |
151 | + self.desc = " MCC " | |
152 | + | |
153 | + def run(self,evaluation): | |
154 | + """ | |
155 | + Compute metric. | |
156 | + """ | |
157 | + VP = len(evaluation.true_positive) | |
158 | + FP = len(evaluation.false_positive) | |
159 | + FN = len(evaluation.false_negative) | |
160 | + VN = evaluation.true_negative_len | |
161 | + if (VP+FP)==0 or (VP+FN)==0 or (VN+FP)==0 or (VN+FN)==0: | |
162 | + return 0 | |
163 | + MCC = (((VP*VN)-(FP*FN))/math.sqrt((VP+FP)*(VP+FN)*(VN+FP)*(VN+FN))) | |
164 | + return MCC | |
165 | + | |
143 | 166 | class F_score(Metric): |
144 | 167 | """ |
145 | 168 | Classification accuracy metric which correlates precision and recall into an | ... | ... |
... | ... | @@ -0,0 +1,51 @@ |
1 | +#!/usr/bin/env python | |
2 | +""" | |
3 | + recommender suite - recommender experiments suite | |
4 | +""" | |
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | |
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | |
7 | +__license__ = """ | |
8 | + This program is free software: you can redistribute it and/or modify | |
9 | + it under the terms of the GNU General Public License as published by | |
10 | + the Free Software Foundation, either version 3 of the License, or | |
11 | + (at your option) any later version. | |
12 | + | |
13 | + This program is distributed in the hope that it will be useful, | |
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | + GNU General Public License for more details. | |
17 | + | |
18 | + You should have received a copy of the GNU General Public License | |
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | +""" | |
21 | + | |
22 | +import sys | |
23 | +import os | |
24 | +sys.path.insert(0,'../') | |
25 | +from config import Config | |
26 | +from data import PopconXapianIndex, PopconSubmission | |
27 | +from recommender import Recommender | |
28 | +from user import LocalSystem, User | |
29 | +from evaluation import * | |
30 | +import logging | |
31 | +import random | |
32 | +import Gnuplot | |
33 | + | |
34 | +if __name__ == '__main__': | |
35 | + | |
36 | + cfg = Config() | |
37 | + cfg.index_mode = "recluster" | |
38 | + logging.info("Starting clustering experiments") | |
39 | + logging.info("Medoids: %d\t Max popcon:%d" % (cfg.k_medoids,cfg.max_popcon)) | |
40 | + cfg.popcon_dir = os.path.expanduser("~/org/popcon.debian.org/popcon-mail/popcon-entries/") | |
41 | + cfg.popcon_index = cfg.popcon_index+("_%dmedoids%dmax" % | |
42 | + (cfg.k_medoids,cfg.max_popcon)) | |
43 | + cfg.clusters_dir = cfg.clusters_dir+("_%dmedoids%dmax" % | |
44 | + (cfg.k_medoids,cfg.max_popcon)) | |
45 | + pxi = PopconXapianIndex(cfg) | |
46 | + logging.info("Overall dispersion: %f\n" % pxi.cluster_dispersion) | |
47 | + # Write clustering log | |
48 | + output = open(("results/clustering/%dmedoids%dmax" % (cfg.k_medoids,cfg.max_popcon)),'w') | |
49 | + output.write("# k_medoids\tmax_popcon\tdispersion\n") | |
50 | + output.write("%d %f\n" % (cfg.k_medoids,cfg.max_popcon,pxi.cluster_dispersion)) | |
51 | + output.close() | ... | ... |
... | ... | @@ -0,0 +1,27 @@ |
1 | +[DEFAULT] | |
2 | +repetitions = 1 | |
3 | +iterations = 10 | |
4 | +path = 'results' | |
5 | +experiment = 'grid' | |
6 | +weight = ['bm25', 'trad'] | |
7 | +;profile_size = range(10,100,10) | |
8 | +;sample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] | |
9 | +sample = [0.6, 0.7, 0.8, 0.9] | |
10 | + | |
11 | +[content] | |
12 | +strategy = ['cb','cbt','cbd'] | |
13 | + | |
14 | +[clustering] | |
15 | +experiment = 'single' | |
16 | +;iterations = 4 | |
17 | +;medoids = range(2,6) | |
18 | +iterations = 6 | |
19 | +medoids = [100,500,1000,5000,10000,50000] | |
20 | +;disabled for this experiment | |
21 | +weight = 0 | |
22 | +profile_size = 0 | |
23 | +sample = 0 | |
24 | + | |
25 | +[colaborative] | |
26 | +users_repository=["data/popcon","data/popcon-100","data/popcon-500","data/popcon-1000","data/popcon-5000","data/popcon-10000","data/popcon-50000"] | |
27 | +neighbors = range(10,1010,50) | ... | ... |
... | ... | @@ -0,0 +1,171 @@ |
1 | +#!/usr/bin/env python | |
2 | +""" | |
3 | + recommender suite - recommender experiments suite | |
4 | +""" | |
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | |
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | |
7 | +__license__ = """ | |
8 | + This program is free software: you can redistribute it and/or modify | |
9 | + it under the terms of the GNU General Public License as published by | |
10 | + the Free Software Foundation, either version 3 of the License, or | |
11 | + (at your option) any later version. | |
12 | + | |
13 | + This program is distributed in the hope that it will be useful, | |
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | + GNU General Public License for more details. | |
17 | + | |
18 | + You should have received a copy of the GNU General Public License | |
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | +""" | |
21 | + | |
22 | +import expsuite | |
23 | +import sys | |
24 | +sys.path.insert(0,'../') | |
25 | +from config import Config | |
26 | +from data import PopconXapianIndex, PopconSubmission | |
27 | +from recommender import Recommender | |
28 | +from user import LocalSystem, User | |
29 | +from evaluation import * | |
30 | +import logging | |
31 | +import random | |
32 | +import Gnuplot | |
33 | + | |
34 | +class ClusteringSuite(expsuite.PyExperimentSuite): | |
35 | + def reset(self, params, rep): | |
36 | + self.cfg = Config() | |
37 | + self.cfg.popcon_index = "../tests/test_data/.sample_pxi" | |
38 | + self.cfg.popcon_dir = "../tests/test_data/popcon_dir" | |
39 | + self.cfg.clusters_dir = "../tests/test_data/clusters_dir" | |
40 | + | |
41 | + if params['name'] == "clustering": | |
42 | + logging.info("Starting 'clustering' experiments suite...") | |
43 | + self.cfg.index_mode = "recluster" | |
44 | + | |
45 | + def iterate(self, params, rep, n): | |
46 | + if params['name'] == "clustering": | |
47 | + logging.info("Running iteration %d" % params['medoids'][n]) | |
48 | + self.cfg.k_medoids = params['medoids'][n] | |
49 | + pxi = PopconXapianIndex(self.cfg) | |
50 | + result = {'k_medoids': params['medoids'][n], | |
51 | + 'dispersion': pxi.cluster_dispersion} | |
52 | + else: | |
53 | + result = {} | |
54 | + return result | |
55 | + | |
56 | +class ContentBasedSuite(expsuite.PyExperimentSuite): | |
57 | + def reset(self, params, rep): | |
58 | + if params['name'].startswith("content"): | |
59 | + cfg = Config() | |
60 | + #if the index was not built yet | |
61 | + #app_axi = AppAptXapianIndex(cfg.axi,"results/arnaldo/AppAxi") | |
62 | + cfg.axi = "data/AppAxi" | |
63 | + cfg.index_mode = "old" | |
64 | + cfg.weight = params['weight'] | |
65 | + self.rec = Recommender(cfg) | |
66 | + self.rec.set_strategy(params['strategy']) | |
67 | + self.repo_size = self.rec.items_repository.get_doccount() | |
68 | + self.user = LocalSystem() | |
69 | + self.user.app_pkg_profile(self.rec.items_repository) | |
70 | + self.user.no_auto_pkg_profile() | |
71 | + self.sample_size = int(len(self.user.pkg_profile)*params['sample']) | |
72 | + # 'iterations' should be set to 10 in the config file | |
73 | + #self.profile_size = range(10,101,10) | |
74 | + | |
75 | + def iterate(self, params, rep, n): | |
76 | + if params['name'].startswith("content"): | |
77 | + item_score = dict.fromkeys(self.user.pkg_profile,1) | |
78 | + # Prepare partition | |
79 | + sample = {} | |
80 | + for i in range(self.sample_size): | |
81 | + key = random.choice(item_score.keys()) | |
82 | + sample[key] = item_score.pop(key) | |
83 | + # Get full recommendation | |
84 | + user = User(item_score) | |
85 | + recommendation = self.rec.get_recommendation(user,self.repo_size) | |
86 | + # Write recall log | |
87 | + recall_file = "results/content/recall/%s-%s-%.2f-%d" % \ | |
88 | + (params['strategy'],params['weight'],params['sample'],n) | |
89 | + output = open(recall_file,'w') | |
90 | + output.write("# weight=%s\n" % params['weight']) | |
91 | + output.write("# strategy=%s\n" % params['strategy']) | |
92 | + output.write("# sample=%f\n" % params['sample']) | |
93 | + output.write("\n%d %d %d\n" % \ | |
94 | + (self.repo_size,len(item_score),self.sample_size)) | |
95 | + notfound = [] | |
96 | + ranks = [] | |
97 | + for pkg in sample.keys(): | |
98 | + if pkg in recommendation.ranking: | |
99 | + ranks.append(recommendation.ranking.index(pkg)) | |
100 | + else: | |
101 | + notfound.append(pkg) | |
102 | + for r in sorted(ranks): | |
103 | + output.write(str(r)+"\n") | |
104 | + if notfound: | |
105 | + output.write("Out of recommendation:\n") | |
106 | + for pkg in notfound: | |
107 | + output.write(pkg+"\n") | |
108 | + output.close() | |
109 | + # Plot metrics summary | |
110 | + accuracy = [] | |
111 | + precision = [] | |
112 | + recall = [] | |
113 | + f1 = [] | |
114 | + g = Gnuplot.Gnuplot() | |
115 | + g('set style data lines') | |
116 | + g.xlabel('Recommendation size') | |
117 | + for size in range(1,len(recommendation.ranking)+1,100): | |
118 | + predicted = RecommendationResult(dict.fromkeys(recommendation.ranking[:size],1)) | |
119 | + real = RecommendationResult(sample) | |
120 | + evaluation = Evaluation(predicted,real,self.repo_size) | |
121 | + accuracy.append([size,evaluation.run(Accuracy())]) | |
122 | + precision.append([size,evaluation.run(Precision())]) | |
123 | + recall.append([size,evaluation.run(Recall())]) | |
124 | + f1.append([size,evaluation.run(F1())]) | |
125 | + g.plot(Gnuplot.Data(accuracy,title="Accuracy"), | |
126 | + Gnuplot.Data(precision,title="Precision"), | |
127 | + Gnuplot.Data(recall,title="Recall"), | |
128 | + Gnuplot.Data(f1,title="F1")) | |
129 | + g.hardcopy(recall_file+"-plot.ps", enhanced=1, color=1) | |
130 | + # Iteration log | |
131 | + result = {'iteration': n, | |
132 | + 'weight': params['weight'], | |
133 | + 'strategy': params['strategy'], | |
134 | + 'accuracy': accuracy[20], | |
135 | + 'precision': precision[20], | |
136 | + 'recall:': recall[20], | |
137 | + 'f1': f1[20]} | |
138 | + return result | |
139 | + | |
140 | +#class CollaborativeSuite(expsuite.PyExperimentSuite): | |
141 | +# def reset(self, params, rep): | |
142 | +# if params['name'].startswith("collaborative"): | |
143 | +# | |
144 | +# def iterate(self, params, rep, n): | |
145 | +# if params['name'].startswith("collaborative"): | |
146 | +# for root, dirs, files in os.walk(self.source_dir): | |
147 | +# for popcon_file in files: | |
148 | +# submission = PopconSubmission(os.path.join(root,popcon_file)) | |
149 | +# user = User(submission.packages) | |
150 | +# user.maximal_pkg_profile() | |
151 | +# rec.get_recommendation(user) | |
152 | +# precision = 0 | |
153 | +# result = {'weight': params['weight'], | |
154 | +# 'strategy': params['strategy'], | |
155 | +# 'profile_size': self.profile_size[n], | |
156 | +# 'accuracy': accuracy, | |
157 | +# 'precision': precision, | |
158 | +# 'recall:': recall, | |
159 | +# 'f1': } | |
160 | +# else: | |
161 | +# result = {} | |
162 | +# return result | |
163 | + | |
if __name__ == '__main__':
    # With fewer than two command-line arguments every suite is started;
    # otherwise only the suites named on the command line are.
    arguments = sys.argv
    run_everything = len(arguments) < 3

    if run_everything or "clustering" in arguments:
        ClusteringSuite().start()
    if run_everything or "content" in arguments:
        ContentBasedSuite().start()
    #if "collaborative" in sys.argv or len(sys.argv)<3:
        #CollaborativeSuite().start()
... | ... | @@ -0,0 +1,49 @@ |
1 | +#! /usr/bin/env python | |
2 | +""" | |
3 | + sample-popcon - extract a sample from popcon population | |
4 | +""" | |
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | |
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | |
7 | +__license__ = """ | |
8 | + This program is free software: you can redistribute it and/or modify | |
9 | + it under the terms of the GNU General Public License as published by | |
10 | + the Free Software Foundation, either version 3 of the License, or | |
11 | + (at your option) any later version. | |
12 | + | |
13 | + This program is distributed in the hope that it will be useful, | |
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | + GNU General Public License for more details. | |
17 | + | |
18 | + You should have received a copy of the GNU General Public License | |
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | +""" | |
21 | + | |
22 | +import xapian | |
23 | +import os | |
24 | +import random | |
25 | +import sys | |
26 | + | |
if __name__ == '__main__':
    # Move the submissions listed in sample_file out of the popcon index
    # into a new index named "<popcon_index>-<sample file name>".
    #
    # The command line is checked separately from opening the database,
    # so that a real Xapian failure is not reported as a usage error.
    if len(sys.argv) < 3:
        print("Usage: extract-sample-db sample_file popcon_index")
        sys.exit(1)
    sample_file = sys.argv[1]
    try:
        popcon = xapian.WritableDatabase(sys.argv[2],xapian.DB_OPEN)
    except xapian.DatabaseError as error:
        print("Could not open popcon index %s: %s" % (sys.argv[2],error))
        print("Usage: extract-sample-db sample_file popcon_index")
        sys.exit(1)
    enquire = xapian.Enquire(popcon)
    print(sample_file.split("/"))
    new_popcon = xapian.WritableDatabase(
        sys.argv[2]+"-"+sample_file.split("/")[-1],
        xapian.DB_CREATE_OR_OVERWRITE)
    print("Popcon repository size: %d" % popcon.get_doccount())
    with open(sample_file) as submissions:
        for submission in submissions:
            submission_id = submission.strip()
            if not submission_id:
                # A blank line would only produce the bare term "ID".
                continue
            term = "ID"+submission_id
            print(term)
            enquire.set_query(xapian.Query(term))
            # At most 20 matching documents are moved per submission id.
            mset = enquire.get_mset(0,20)
            for m in mset:
                print("Adding doc %s"%m.docid)
                new_popcon.add_document(popcon.get_document(m.docid))
                print("Removing doc %s"%m.docid)
                popcon.delete_document(m.docid)
    # First line: what is left in the source index; second: the new index.
    print("Popcon repository size: %d" % popcon.get_doccount())
    print("Popcon repository size: %d" % new_popcon.get_doccount())
... | ... | @@ -0,0 +1,197 @@ |
1 | +#!/usr/bin/env python | |
2 | +""" | |
3 | + hybrid-suite | |
4 | +""" | |
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | |
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | |
7 | +__license__ = """ | |
8 | + This program is free software: you can redistribute it and/or modify | |
9 | + it under the terms of the GNU General Public License as published by | |
10 | + the Free Software Foundation, either version 3 of the License, or | |
11 | + (at your option) any later version. | |
12 | + | |
13 | + This program is distributed in the hope that it will be useful, | |
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | + GNU General Public License for more details. | |
17 | + | |
18 | + You should have received a copy of the GNU General Public License | |
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | +""" | |
21 | + | |
22 | +import sys | |
23 | +sys.path.insert(0,'../') | |
24 | +from config import Config | |
25 | +from data import PopconXapianIndex, PopconSubmission | |
26 | +from recommender import Recommender | |
27 | +from user import LocalSystem, User | |
28 | +from evaluation import * | |
29 | +import logging | |
30 | +import random | |
31 | +import Gnuplot | |
32 | +import numpy | |
33 | + | |
def _mean(values):
    """Return the arithmetic mean of `values`, or 0.0 when it is empty."""
    if not values:
        return 0.0
    return float(sum(values))/len(values)


def _coverage(recommended, repo_size):
    """Return len(recommended)/repo_size, or 0.0 for an empty repository."""
    if not repo_size:
        return 0.0
    return len(recommended)/float(repo_size)


def _plot_profile_summary(title, metric_title, metric_summary, coverage,
                          graph_file):
    """Plot mean metric and coverage against profile size as a PNG.

    metric_summary maps a profile size to a list of values (averaged
    here); coverage maps a profile size to a single ratio.
    """
    g = Gnuplot.Gnuplot()
    g('set style data lines')
    g('set yrange [0:1.0]')
    g.xlabel('Profile size')
    g.title(title)
    g.plot(Gnuplot.Data(sorted([[size,_mean(metric_summary[size])]
                                for size in metric_summary]),
                        title=metric_title),
           Gnuplot.Data(sorted([[size,coverage[size]]
                                for size in coverage]),
                        title="Coverage"))
    g.hardcopy(graph_file,terminal="png")


if __name__ == '__main__':
    # os is used throughout this script but is not among the imports
    # visible at the top of the file, so it is imported here.
    import os
    # NOTE(review): PopconSystem is not imported explicitly either (only
    # LocalSystem and User are); presumably it arrives through
    # "from evaluation import *" - confirm.

    # Both the strategy and the sample file are required.
    if len(sys.argv)<3:
        print("Usage: hybrid strategy sample_file")
        sys.exit(1)

    iterations = 20
    profile_size = [10,40,70,100,170,240]
    neighbor_size = [3,10,50,100,200,400]

    #hybrid_strategies = ['knnco','knnco_eset']

    # Smaller setup for quick runs:
    #iterations = 1
    #profile_size = [10,20,30]
    #neighbor_size = [10,20,30]

    cfg = Config()
    strategy = sys.argv[1]
    sample_file = sys.argv[2]
    sample_str = sample_file.split('/')[-1]
    # One popcon submission path per non-empty line of the sample file.
    population_sample = []
    with open(sample_file,'r') as f:
        for line in f:
            user_id = line.strip('\n')
            if not user_id:
                continue
            population_sample.append(
                os.path.join(cfg.popcon_dir,user_id[:2],user_id))
    sample_dir = ("results/hybrid/%s" % sample_str)
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)

    cfg.strategy = strategy
    # All dictionaries below are indexed as [neighborhood size][profile size].
    p_20_summary = {}       # per-user mean precision of the top 20
    f05_100_summary = {}    # per-user mean F0.5 of the top 100
    c_20 = {}               # every package ever seen in a top 20
    c_100 = {}              # every package ever seen in a top 100

    log_file = os.path.join(sample_dir,sample_str+"-"+cfg.strategy)
    graph_20 = {}
    graph_100 = {}
    graph_20_jpg = {}
    graph_100_jpg = {}
    comment_20 = {}
    comment_100 = {}
    profile_log = {}
    for k in neighbor_size:
        graph_20[k] = log_file+("-neighboorhod%.3d-020.png"%k)
        graph_100[k] = log_file+("-neighboorhod%.3d-100.png"%k)
        # splitext removes exactly the ".png" suffix; str.strip(".png")
        # would remove any leading/trailing '.', 'p', 'n' or 'g'.
        graph_20_jpg[k] = os.path.splitext(graph_20[k])[0]+".jpg"
        graph_100_jpg[k] = os.path.splitext(graph_100[k])[0]+".jpg"
        comment_20[k] = graph_20_jpg[k]+".comment"
        comment_100[k] = graph_100_jpg[k]+".comment"

        with open(comment_20[k],'w') as f:
            f.write("# %s\n" % sample_str)
            f.write("# strategy %s\n# threshold 20\n# iterations %d\n\n" %
                    (cfg.strategy,iterations))
            f.write("# neighboorhood\tprofile\tp_20\tc_20\n\n")
        with open(comment_100[k],'w') as f:
            f.write("# %s\n" % sample_str)
            f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
                    (cfg.strategy,iterations))
            f.write("# neighboorhood\tprofile\tf05_100\tc_100\n\n")

        c_20[k] = {}
        c_100[k] = {}
        p_20_summary[k] = {}
        f05_100_summary[k] = {}
        profile_log[k] = {}
        for size in profile_size:
            c_20[k][size] = set()
            c_100[k][size] = set()
            p_20_summary[k][size] = []
            f05_100_summary[k][size] = []
            profile_log[k][size] = \
                log_file+"-neighboorhood%.3d-profile%.3d"%(k,size)
            with open(profile_log[k][size],'w') as f:
                f.write("# %s\n" % sample_str)
                f.write("# strategy %s-neighboorhood%.3d-profile%.3d\n\n" %
                        (cfg.strategy,k,size))
                f.write("# p_20\t\tf05_100\n\n")

    # Stays 0 when the sample is empty, so the coverage below is defined.
    repo_size = 0

    # main loop per user
    for submission_file in population_sample:
        user = PopconSystem(submission_file)
        user.filter_pkg_profile(cfg.pkgs_filter)
        user.maximal_pkg_profile()
        for k in neighbor_size:
            cfg.k_neighbors = k
            for size in profile_size:
                cfg.profile_size = size
                rec = Recommender(cfg)
                repo_size = rec.items_repository.get_doccount()
                p_20 = []
                f05_100 = []
                for n in range(iterations):
                    # Fill sample profile: 90% of the profile is held out
                    # and the remaining 10% is given to the recommender.
                    profile_len = len(user.pkg_profile)
                    item_score = {}
                    for pkg in user.pkg_profile:
                        item_score[pkg] = user.item_score[pkg]
                    sample = {}
                    sample_size = int(profile_len*0.9)
                    for i in range(sample_size):
                        key = random.choice(list(item_score))
                        sample[key] = item_score.pop(key)
                    iteration_user = User(item_score)
                    recommendation = rec.get_recommendation(iteration_user,
                                                            repo_size)
                    if hasattr(recommendation,"ranking"):
                        ranking = recommendation.ranking
                        real = RecommendationResult(sample)
                        predicted_20 = RecommendationResult(
                            dict.fromkeys(ranking[:20],1))
                        evaluation = Evaluation(predicted_20,real,repo_size)
                        p_20.append(evaluation.run(Precision()))
                        predicted_100 = RecommendationResult(
                            dict.fromkeys(ranking[:100],1))
                        evaluation = Evaluation(predicted_100,real,repo_size)
                        f05_100.append(evaluation.run(F_score(0.5)))
                        c_20[k][size] = c_20[k][size].union(ranking[:20])
                        c_100[k][size] = c_100[k][size].union(ranking[:100])
                # save summary; nothing is recorded for a user for whom no
                # iteration produced a ranking (both lists grow together).
                if p_20 and f05_100:
                    p_20_summary[k][size].append(_mean(p_20))
                    f05_100_summary[k][size].append(_mean(f05_100))
                    with open(profile_log[k][size],'a') as f:
                        f.write("%.4f\t\t%.4f\n" %
                                (_mean(p_20),_mean(f05_100)))

    # back to main flow
    coverage_20 = {}
    coverage_100 = {}
    for k in neighbor_size:
        coverage_20[k] = {}
        coverage_100[k] = {}
        with open(comment_20[k],'a') as f:
            for size in profile_size:
                coverage_20[k][size] = _coverage(c_20[k][size],repo_size)
                f.write("%3d\t\t%3d\t\t%.4f\t%.4f\n" %
                        (k,size,_mean(p_20_summary[k][size]),
                         coverage_20[k][size]))
        with open(comment_100[k],'a') as f:
            for size in profile_size:
                coverage_100[k][size] = _coverage(c_100[k][size],repo_size)
                f.write("%3d\t\t%3d\t\t%.4f\t%.4f\n" %
                        (k,size,_mean(f05_100_summary[k][size]),
                         coverage_100[k][size]))

    # plot results summary (conversion of the PNG files to JPG with
    # "convert -quality 100" is currently disabled)
    for k in neighbor_size:
        _plot_profile_summary(
            "Setup: %s-neighboorhood%3d (threshold 20)" % (cfg.strategy,k),
            "Precision",p_20_summary[k],coverage_20[k],graph_20[k])
        _plot_profile_summary(
            "Setup: %s-neighboorhood%3d (threshold 100)" % (cfg.strategy,k),
            "F05",f05_100_summary[k],coverage_100[k],graph_100[k])
... | ... | @@ -0,0 +1,186 @@ |
1 | +#!/usr/bin/env python | |
2 | +""" | |
3 | + k-suite - experiment different neighborhood sizes | |
4 | +""" | |
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | |
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | |
7 | +__license__ = """ | |
8 | + This program is free software: you can redistribute it and/or modify | |
9 | + it under the terms of the GNU General Public License as published by | |
10 | + the Free Software Foundation, either version 3 of the License, or | |
11 | + (at your option) any later version. | |
12 | + | |
13 | + This program is distributed in the hope that it will be useful, | |
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | + GNU General Public License for more details. | |
17 | + | |
18 | + You should have received a copy of the GNU General Public License | |
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | +""" | |
21 | + | |
22 | +import sys | |
23 | +sys.path.insert(0,'../') | |
24 | +from config import Config | |
25 | +from data import PopconXapianIndex, PopconSubmission | |
26 | +from recommender import Recommender | |
27 | +from user import LocalSystem, User | |
28 | +from evaluation import * | |
29 | +import logging | |
30 | +import random | |
31 | +import Gnuplot | |
32 | +import numpy | |
33 | + | |
def plot_roc(k,roc_points,log_file):
    """Plot the ROC points of neighborhood size `k`.

    The points are drawn over the [0,0]-[1,1] diagonal, with both axes
    limited to [0:1.0]. The chart is written twice, to
    "<log_file>-k<kkk>.png" and "<log_file>-k<kkk>.ps".
    """
    setup_name = log_file.split("/")[-1]
    output_base = log_file+("-k%.3d"%k)

    chart = Gnuplot.Gnuplot()
    chart('set style data points')
    chart.xlabel('False Positive Rate')
    chart.ylabel('True Positive Rate')
    chart('set xrange [0:1.0]')
    chart('set yrange [0:1.0]')
    chart.title("Setup: %s-k%d" % (setup_name,k))

    diagonal = Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7")
    measured = Gnuplot.Data(roc_points)
    chart.plot(diagonal,measured)

    chart.hardcopy(output_base+".png",terminal="png")
    chart.hardcopy(output_base+".ps",terminal="postscript",enhanced=1,color=1)
46 | + | |
def plot_summary(precision,f05,mcc,log_file):
    """Plot mean precision, F0.5 and MCC against the neighborhood size.

    precision, f05 and mcc each map a neighborhood size k to a list of
    values; the mean of every list is plotted. The chart is written to
    "<log_file>.png" and "<log_file>.ps".
    """
    def mean_points(metric):
        # Points are ordered by k because the "lines" style connects them
        # in the order given; sizes without any value are left out
        # instead of dividing by zero.
        points = []
        for k in sorted(metric):
            values = metric[k]
            if values:
                points.append([k,float(sum(values))/len(values)])
        return points

    g = Gnuplot.Gnuplot()
    g('set style data lines')
    g.xlabel('Neighborhood (k)')
    g.title("Setup: %s-size20" % (log_file.split("/")[-1]))
    g.plot(Gnuplot.Data(mean_points(precision),title="P"),
           Gnuplot.Data(mean_points(f05),title="F05"),
           Gnuplot.Data(mean_points(mcc),title="MCC"))
    g.hardcopy(log_file+(".png"),terminal="png")
    g.hardcopy(log_file+(".ps"),terminal="postscript",enhanced=1,color=1)
57 | + | |
class ExperimentResults:
    """Collects evaluation metrics over repeated recommendation rounds.

    Every add_result() call appends one value to each of the precision,
    recall, fpr, f05 and mcc lists; the get_* methods average them.
    """

    def __init__(self,repo_size):
        """Start with empty metric lists for a repository of `repo_size`."""
        self.repository_size = repo_size
        self.precision = []
        self.recall = []
        self.fpr = []
        self.f05 = []
        self.mcc = []

    def add_result(self,ranking,sample):
        """Evaluate `ranking` against `sample` and store every metric."""
        evaluation = Evaluation(RecommendationResult(dict.fromkeys(ranking,1)),
                                RecommendationResult(sample),
                                self.repository_size)
        run = evaluation.run
        self.precision.append(run(Precision()))
        self.recall.append(run(Recall()))
        self.fpr.append(run(FPR()))
        self.f05.append(run(F_score(0.5)))
        self.mcc.append(run(MCC()))

    def _average(self,values):
        """Return the mean of `values`, or 0 when there are none."""
        if values:
            return sum(values)/len(values)
        return 0

    def get_roc_point(self):
        """Return [mean FPR, mean recall], or [0,0] if either is missing."""
        if self.recall and self.fpr:
            return [sum(self.fpr)/len(self.fpr),
                    sum(self.recall)/len(self.recall)]
        return [0,0]

    def get_precision_summary(self):
        """Return the mean precision, or 0 without results."""
        return self._average(self.precision)

    def get_f05_summary(self):
        """Return the mean F0.5 score, or 0 without results."""
        return self._average(self.f05)

    def get_mcc_summary(self):
        """Return the mean MCC, or 0 without results."""
        return self._average(self.mcc)
95 | + | |
if __name__ == '__main__':
    # os is used throughout this script but is not among the imports
    # visible at the top of the file, so it is imported here.
    import os
    # NOTE(review): PopconSystem is not imported explicitly either (only
    # LocalSystem and User are); presumably it arrives through
    # "from evaluation import *" - confirm.

    if len(sys.argv)<3:
        print("Usage: k-suite strategy_str sample_file")
        sys.exit(1)
    threshold = 20      # recommendation size
    iterations = 30     # rounds per user and neighborhood size
    neighbors = [3,5,10,50,100,150,200,300,400,500]
    cfg = Config()
    cfg.strategy = sys.argv[1]
    sample_file = sys.argv[2]
    sample_str = sample_file.split('/')[-1]
    # One popcon submission path per non-empty line of the sample file.
    population_sample = []
    with open(sample_file,'r') as f:
        for line in f:
            user_id = line.strip('\n')
            if user_id:
                population_sample.append(
                    os.path.join(cfg.popcon_dir,user_id[:2],user_id))
    if not population_sample:
        # Without users there is no repository size and nothing to average.
        print("No submissions listed in %s" % sample_file)
        sys.exit(1)

    # setup dictionaries and files (all indexed by neighborhood size k)
    roc_summary = {}
    recommended = {}
    precision_summary = {}
    f05_summary = {}
    mcc_summary = {}
    k_log = {}
    sample_dir = ("results/k-suite/%s" % sample_str)
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)
    log_file = os.path.join(sample_dir,cfg.strategy)
    with open(log_file,'w') as f:
        f.write("# %s\n\n" % sample_str)
        f.write("# strategy %s recommendation_size %d iterations %d\n\n" %
                (cfg.strategy,threshold,iterations))
        f.write("# k coverage \tprecision \tf05 \tmcc\n\n")

    for k in neighbors:
        roc_summary[k] = []
        recommended[k] = set()
        precision_summary[k] = []
        f05_summary[k] = []
        mcc_summary[k] = []
        k_log[k] = log_file+"-k%.3d"%k
        with open(k_log[k],'w') as f:
            f.write("# %s\n\n" % sample_str)
            f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))
            f.write("# roc_point \tprecision \tf05 \tmcc\n\n")

    # main loop per user
    for submission_file in population_sample:
        user = PopconSystem(submission_file)
        user.filter_pkg_profile(cfg.pkgs_filter)
        user.maximal_pkg_profile()
        for k in neighbors:
            cfg.k_neighbors = k
            rec = Recommender(cfg)
            repo_size = rec.items_repository.get_doccount()
            results = ExperimentResults(repo_size)
            # n iterations for same recommender and user
            for n in range(iterations):
                # Fill sample profile: 90% of the profile is held out and
                # the remaining 10% is given to the recommender.
                profile_len = len(user.pkg_profile)
                item_score = {}
                for pkg in user.pkg_profile:
                    item_score[pkg] = user.item_score[pkg]
                sample = {}
                sample_size = int(profile_len*0.9)
                for i in range(sample_size):
                    key = random.choice(list(item_score))
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,
                                                        threshold)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
                    recommended[k] = recommended[k].union(
                        recommendation.ranking)
            # save summary
            roc_point = results.get_roc_point()
            roc_summary[k].append(roc_point)
            precision = results.get_precision_summary()
            precision_summary[k].append(precision)
            f05 = results.get_f05_summary()
            f05_summary[k].append(f05)
            mcc = results.get_mcc_summary()
            mcc_summary[k].append(mcc)
            with open(k_log[k],'a') as f:
                f.write("[%.2f,%.2f] \t%.4f \t%.4f \t%.4f\n" %
                        (roc_point[0],roc_point[1],precision,f05,mcc))

    # back to main flow
    plot_summary(precision_summary,f05_summary,mcc_summary,log_file)
    with open(log_file,'a') as f:
        for k in neighbors:
            # Share of the repository recommended at least once with this k.
            if repo_size:
                coverage = len(recommended[k])/float(repo_size)
            else:
                coverage = 0.0
            f.write("%3d \t%.2f \t%.4f \t%.4f \t%.4f\n" %
                    (k,coverage,
                     float(sum(precision_summary[k]))/len(precision_summary[k]),
                     float(sum(f05_summary[k]))/len(f05_summary[k]),
                     float(sum(mcc_summary[k]))/len(mcc_summary[k])))
            plot_roc(k,roc_summary[k],log_file)
... | ... | @@ -0,0 +1,51 @@ |
1 | +#!/usr/bin/env python | |
2 | +""" | |
3 | + recommender suite - recommender experiments suite | |
4 | +""" | |
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | |
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | |
7 | +__license__ = """ | |
8 | + This program is free software: you can redistribute it and/or modify | |
9 | + it under the terms of the GNU General Public License as published by | |
10 | + the Free Software Foundation, either version 3 of the License, or | |
11 | + (at your option) any later version. | |
12 | + | |
13 | + This program is distributed in the hope that it will be useful, | |
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | + GNU General Public License for more details. | |
17 | + | |
18 | + You should have received a copy of the GNU General Public License | |
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | +""" | |
21 | + | |
22 | +import sys | |
23 | +import os | |
24 | +sys.path.insert(0,'../') | |
25 | +from config import Config | |
26 | +from data import PopconXapianIndex, PopconSubmission | |
27 | +from recommender import Recommender | |
28 | +from user import LocalSystem, User | |
29 | +from evaluation import * | |
30 | +import logging | |
31 | +import random | |
32 | +import Gnuplot | |
33 | + | |
if __name__ == '__main__':
    # Rebuild the popcon index in "recluster" mode and log the resulting
    # cluster dispersion for the configured medoid count and popcon limit.
    cfg = Config()
    cfg.index_mode = "recluster"
    logging.info("Starting clustering experiments")
    logging.info("Medoids: %d\t Max popcon:%d" % (cfg.k_medoids,cfg.max_popcon))
    cfg.popcon_dir = os.path.expanduser("~/org/popcon.debian.org/popcon-mail/popcon-entries/")
    # Index and cluster directories get a suffix naming this setup.
    setup_suffix = "_%dmedoids%dmax" % (cfg.k_medoids,cfg.max_popcon)
    cfg.popcon_index = cfg.popcon_index+setup_suffix
    cfg.clusters_dir = cfg.clusters_dir+setup_suffix
    pxi = PopconXapianIndex(cfg)
    logging.info("Overall dispersion: %f\n" % pxi.cluster_dispersion)
    # Write clustering log
    results_dir = "results/clustering"
    if not os.path.exists(results_dir):
        # Do not lose the computed dispersion to a missing directory.
        os.makedirs(results_dir)
    results_file = "results/clustering/%dmedoids%dmax" % \
        (cfg.k_medoids,cfg.max_popcon)
    with open(results_file,'w') as output:
        output.write("# k_medoids\tmax_popcon\tdispersion\n")
        # One value per header column.
        output.write("%d\t%d\t%f\n" %
                     (cfg.k_medoids,cfg.max_popcon,pxi.cluster_dispersion))
... | ... | @@ -0,0 +1,27 @@ |
1 | +[DEFAULT] | |
2 | +repetitions = 1 | |
3 | +iterations = 10 | |
4 | +path = 'results' | |
5 | +experiment = 'grid' | |
6 | +weight = ['bm25', 'trad'] | |
7 | +;profile_size = range(10,100,10) | |
8 | +;sample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] | |
9 | +sample = [0.6, 0.7, 0.8, 0.9] | |
10 | + | |
11 | +[content] | |
12 | +strategy = ['cb','cbt','cbd'] | |
13 | + | |
14 | +[clustering] | |
15 | +experiment = 'single' | |
16 | +;iterations = 4 | |
17 | +;medoids = range(2,6) | |
18 | +iterations = 6 | |
19 | +medoids = [100,500,1000,5000,10000,50000] | |
20 | +;disabled for this experiment | |
21 | +weight = 0 | |
22 | +profile_size = 0 | |
23 | +sample = 0 | |
24 | + | |
25 | +[colaborative] | |
26 | +users_repository=["data/popcon","data/popcon-100","data/popcon-500","data/popcon-1000","data/popcon-5000","data/popcon-10000","data/popcon-50000"] | |
27 | +neighbors = range(10,1010,50) | ... | ... |
... | ... | @@ -0,0 +1,171 @@ |
1 | +#!/usr/bin/env python | |
2 | +""" | |
3 | + recommender suite - recommender experiments suite | |
4 | +""" | |
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | |
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | |
7 | +__license__ = """ | |
8 | + This program is free software: you can redistribute it and/or modify | |
9 | + it under the terms of the GNU General Public License as published by | |
10 | + the Free Software Foundation, either version 3 of the License, or | |
11 | + (at your option) any later version. | |
12 | + | |
13 | + This program is distributed in the hope that it will be useful, | |
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | + GNU General Public License for more details. | |
17 | + | |
18 | + You should have received a copy of the GNU General Public License | |
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | +""" | |
21 | + | |
22 | +import expsuite | |
23 | +import sys | |
24 | +sys.path.insert(0,'../') | |
25 | +from config import Config | |
26 | +from data import PopconXapianIndex, PopconSubmission | |
27 | +from recommender import Recommender | |
28 | +from user import LocalSystem, User | |
29 | +from evaluation import * | |
30 | +import logging | |
31 | +import random | |
32 | +import Gnuplot | |
33 | + | |
class ClusteringSuite(expsuite.PyExperimentSuite):
    """Experiment suite reporting cluster dispersion per medoid count.

    The configuration points at the data under "../tests/test_data".
    """

    def reset(self, params, rep):
        """Create a fresh configuration for a repetition.

        The index is only switched to "recluster" mode for the
        "clustering" experiment. `rep` is not used.
        """
        self.cfg = Config()
        test_data = {'popcon_index': "../tests/test_data/.sample_pxi",
                     'popcon_dir': "../tests/test_data/popcon_dir",
                     'clusters_dir': "../tests/test_data/clusters_dir"}
        for option in ('popcon_index', 'popcon_dir', 'clusters_dir'):
            setattr(self.cfg, option, test_data[option])

        if params['name'] != "clustering":
            return
        logging.info("Starting 'clustering' experiments suite...")
        self.cfg.index_mode = "recluster"

    def iterate(self, params, rep, n):
        """Run one step of the "clustering" experiment.

        The n-th entry of params['medoids'] is stored in
        self.cfg.k_medoids, a PopconXapianIndex is created from self.cfg
        and its cluster_dispersion is reported together with the medoid
        count. For any other experiment name an empty dict is returned.
        """
        if params['name'] != "clustering":
            return {}
        k_medoids = params['medoids'][n]
        logging.info("Running iteration %d" % k_medoids)
        self.cfg.k_medoids = k_medoids
        # The dispersion is read from the index created with this setting.
        pxi = PopconXapianIndex(self.cfg)
        return {'k_medoids': k_medoids,
                'dispersion': pxi.cluster_dispersion}
55 | + | |
class ContentBasedSuite(expsuite.PyExperimentSuite):
    """Experiment suite for the content-based recommendation strategies.

    Every iteration moves a random part of the local user's package
    profile into a held-out sample, asks the recommender for a ranking
    as long as the repository, logs the rank reached by each held-out
    package and plots accuracy, precision, recall and F1 for growing
    recommendation sizes.
    """

    def reset(self, params, rep):
        """Build the recommender and the user profile for one repetition.

        Only acts when the experiment name starts with "content". Reads
        params['weight'], params['strategy'] and params['sample'];
        `rep` is not used.
        """
        if not params['name'].startswith("content"):
            return
        cfg = Config()
        # If the index was not built yet, it can be created with:
        #   app_axi = AppAptXapianIndex(cfg.axi,"results/arnaldo/AppAxi")
        cfg.axi = "data/AppAxi"
        cfg.index_mode = "old"
        cfg.weight = params['weight']
        self.rec = Recommender(cfg)
        self.rec.set_strategy(params['strategy'])
        self.repo_size = self.rec.items_repository.get_doccount()
        self.user = LocalSystem()
        self.user.app_pkg_profile(self.rec.items_repository)
        self.user.no_auto_pkg_profile()
        # Number of profile packages held out in each iteration.
        self.sample_size = int(len(self.user.pkg_profile)*params['sample'])
        # iteration should be set to 10 in config file

    def iterate(self, params, rep, n):
        """Run iteration `n` and return its entry for the experiment log.

        Returns an empty dict when the experiment name does not start
        with "content", mirroring ClusteringSuite.iterate.
        Side effects: writes "results/content/recall/<setup>" and the
        matching "-plot.ps" file.
        """
        if not params['name'].startswith("content"):
            return {}
        item_score = dict.fromkeys(self.user.pkg_profile,1)
        # Prepare partition: the popped packages are what the
        # recommender is expected to find again.
        sample = {}
        for i in range(self.sample_size):
            key = random.choice(list(item_score))
            sample[key] = item_score.pop(key)
        # Get full recommendation
        user = User(item_score)
        recommendation = self.rec.get_recommendation(user,self.repo_size)
        ranking = recommendation.ranking
        # Write recall log
        recall_file = "results/content/recall/%s-%s-%.2f-%d" % \
            (params['strategy'],params['weight'],params['sample'],n)
        self._write_recall_log(recall_file,params,ranking,sample,
                               len(item_score))
        # Plot metrics summary
        accuracy = []
        precision = []
        recall = []
        f1 = []
        g = Gnuplot.Gnuplot()
        g('set style data lines')
        g.xlabel('Recommendation size')
        # The held-out sample is the same for every recommendation size.
        real = RecommendationResult(sample)
        for size in range(1,len(ranking)+1,100):
            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
            evaluation = Evaluation(predicted,real,self.repo_size)
            accuracy.append([size,evaluation.run(Accuracy())])
            precision.append([size,evaluation.run(Precision())])
            recall.append([size,evaluation.run(Recall())])
            f1.append([size,evaluation.run(F1())])
        g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
               Gnuplot.Data(precision,title="Precision"),
               Gnuplot.Data(recall,title="Recall"),
               Gnuplot.Data(f1,title="F1"))
        g.hardcopy(recall_file+"-plot.ps", enhanced=1, color=1)
        # Iteration log: each value is a [size, metric] pair.
        return {'iteration': n,
                'weight': params['weight'],
                'strategy': params['strategy'],
                'accuracy': self._summary_point(accuracy),
                'precision': self._summary_point(precision),
                'recall': self._summary_point(recall),
                'f1': self._summary_point(f1)}

    def _summary_point(self, series, index=20):
        """Return series[index], falling back to the last available point.

        Short rankings yield fewer than index+1 evaluated sizes; the
        returned [size, metric] pair still states which size it refers
        to. Returns None when nothing was evaluated at all.
        """
        if not series:
            return None
        return series[min(index,len(series)-1)]

    def _write_recall_log(self, recall_file, params, ranking, sample,
                          profile_size):
        """Write the ranks reached by the held-out packages to a file.

        The file starts with the setup as comments, then a line with
        repository size, remaining profile size and sample size, then
        the sorted ranks, then the packages missing from the ranking.
        """
        # First position of every ranked package, computed in one pass.
        position = {}
        for rank, pkg in enumerate(ranking):
            position.setdefault(pkg,rank)
        ranks = []
        notfound = []
        for pkg in sample:
            if pkg in position:
                ranks.append(position[pkg])
            else:
                notfound.append(pkg)
        with open(recall_file,'w') as output:
            output.write("# weight=%s\n" % params['weight'])
            output.write("# strategy=%s\n" % params['strategy'])
            output.write("# sample=%f\n" % params['sample'])
            output.write("\n%d %d %d\n" %
                         (self.repo_size,profile_size,self.sample_size))
            for r in sorted(ranks):
                output.write(str(r)+"\n")
            if notfound:
                output.write("Out of recommendation:\n")
                for pkg in notfound:
                    output.write(pkg+"\n")
139 | + | |
140 | +#class CollaborativeSuite(expsuite.PyExperimentSuite): | |
141 | +# def reset(self, params, rep): | |
142 | +# if params['name'].startswith("collaborative"): | |
143 | +# | |
144 | +# def iterate(self, params, rep, n): | |
145 | +# if params['name'].startswith("collaborative"): | |
146 | +# for root, dirs, files in os.walk(self.source_dir): | |
147 | +# for popcon_file in files: | |
148 | +# submission = PopconSubmission(os.path.join(root,popcon_file)) | |
149 | +# user = User(submission.packages) | |
150 | +# user.maximal_pkg_profile() | |
151 | +# rec.get_recommendation(user) | |
152 | +# precision = 0 | |
153 | +# result = {'weight': params['weight'], | |
154 | +# 'strategy': params['strategy'], | |
155 | +# 'profile_size': self.profile_size[n], | |
156 | +# 'accuracy': accuracy, | |
157 | +# 'precision': precision, | |
158 | +# 'recall:': recall, | |
159 | +# 'f1': } | |
160 | +# else: | |
161 | +# result = {} | |
162 | +# return result | |
163 | + | |
if __name__ == '__main__':
    # With fewer than two command line arguments every suite is run;
    # otherwise only the suites named on the command line.
    run_all = len(sys.argv)<3
    if run_all or "clustering" in sys.argv:
        ClusteringSuite().start()
    if run_all or "content" in sys.argv:
        ContentBasedSuite().start()
170 | + #if "collaborative" in sys.argv or len(sys.argv)<3: | |
171 | + #CollaborativeSuite().start() | ... | ... |
... | ... | @@ -0,0 +1,74 @@ |
1 | +#! /usr/bin/env python | |
2 | +""" | |
3 | + misc_popcon - misc experiments with popcon data | |
4 | +""" | |
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | |
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | |
7 | +__license__ = """ | |
8 | + This program is free software: you can redistribute it and/or modify | |
9 | + it under the terms of the GNU General Public License as published by | |
10 | + the Free Software Foundation, either version 3 of the License, or | |
11 | + (at your option) any later version. | |
12 | + | |
13 | + This program is distributed in the hope that it will be useful, | |
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | + GNU General Public License for more details. | |
17 | + | |
18 | + You should have received a copy of the GNU General Public License | |
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | +""" | |
21 | + | |
22 | +import Gnuplot | |
23 | +import xapian | |
24 | +import os | |
25 | +import random | |
26 | +import sys | |
27 | + | |
def get_population_profile(popcon):
    """
    Count how many popcon users exist for each profile size.

    The profile size of a user is the number of terms of its document
    that start with "XP". Users with fewer than 10 such terms are
    reported on stdout.

    popcon -- index object providing get_doccount() and get_document(n)

    Returns (population_profile, max_profile): a list of
    (profile_size, number_of_users) pairs sorted by profile size, and the
    largest profile size found. An empty index yields ([], 0).
    """
    from collections import Counter
    profiles_size = []
    # Documents are fetched by ids 1..get_doccount(), the last one included
    # (assumes contiguous document ids, as extract_sample in sample-popcon
    # does -- TODO confirm for indexes with deleted documents)
    for n in range(1,popcon.get_doccount()+1):
        user = popcon.get_document(n)
        pkgs_profile = [t.term for t in user.termlist() if t.term.startswith("XP")]
        if len(pkgs_profile)<10:
            print("-- profile<10: %s" % user.get_data())
        profiles_size.append(len(pkgs_profile))
    if not profiles_size:
        return [],0
    # Single counting pass instead of one list.count() scan per size
    population_profile = sorted(Counter(profiles_size).items())
    return population_profile,max(profiles_size)
41 | + | |
def get_profile_ranges(population_profile,max_profile,popcon_size):
    """
    Group the population by ranges of profile size.

    The ranges are (0,50], (50,100], ..., (200,250] and (250,max_profile].

    population_profile -- list of (profile_size, number_of_users) pairs
    max_profile        -- upper limit of the last range
    popcon_size        -- total number of users, used for the percentages

    Returns (ranges_population, ranges_percentage): lists of
    (range_maximum, users) and (range_maximum, users/popcon_size) pairs.
    Percentages are 0.0 when popcon_size is 0.
    """
    ranges = list(range(0,251,50))
    ranges.append(max_profile)
    ranges_population = []
    ranges_percentage = []
    # Walk consecutive boundaries pairwise. Looking the lower bound up with
    # ranges.index(maximum) picked the wrong range whenever max_profile was
    # equal to one of the fixed boundaries.
    for minimum,maximum in zip(ranges,ranges[1:]):
        users = sum(count for size,count in population_profile
                    if size>minimum and size<=maximum)
        ranges_population.append((maximum,users))
        if popcon_size:
            percentage = users/float(popcon_size)
        else:
            percentage = 0.0
        ranges_percentage.append((maximum,percentage))
    return ranges_population,ranges_percentage
54 | + | |
def plot(data,xlabel,ylabel,output):
    """
    Plot data as points and save it as <output>.png and <output>.ps.

    data   -- data handed to Gnuplot's plot()
    xlabel -- label of the x axis
    ylabel -- label of the y axis
    output -- path of the output files, without extension
    """
    chart = Gnuplot.Gnuplot()
    chart('set style data points')
    chart.xlabel(xlabel)
    chart.ylabel(ylabel)
    chart.plot(data)
    png_file = output+".png"
    ps_file = output+".ps"
    chart.hardcopy(png_file, terminal="png")
    chart.hardcopy(ps_file, terminal="postscript", enhanced=1, color=1)
63 | + | |
if __name__ == '__main__':
    # Open the desktop applications popcon index of the current user
    index_path = os.path.expanduser("~/.app-recommender/popcon_desktopapps")
    popcon = xapian.Database(index_path)
    print("Popcon repository size: %d" % popcon.get_doccount())

    # Population per profile size, then grouped by ranges of profile size
    profile_population,max_profile = get_population_profile(popcon)
    ranges = get_profile_ranges(profile_population,max_profile,
                                popcon.get_doccount())
    ranges_population,ranges_percentage = ranges
    print("Population per profile range (up to index)")
    print(ranges_population)
    plot(profile_population,"Desktop profile size","Population size",
         "results/misc-popcon/profile_population")
... | ... | @@ -0,0 +1,199 @@ |
1 | +#!/usr/bin/env python | |
2 | +""" | |
3 | + profile-suite - experiment different profile sizes | |
4 | +""" | |
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | |
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | |
7 | +__license__ = """ | |
8 | + This program is free software: you can redistribute it and/or modify | |
9 | + it under the terms of the GNU General Public License as published by | |
10 | + the Free Software Foundation, either version 3 of the License, or | |
11 | + (at your option) any later version. | |
12 | + | |
13 | + This program is distributed in the hope that it will be useful, | |
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | + GNU General Public License for more details. | |
17 | + | |
18 | + You should have received a copy of the GNU General Public License | |
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | +""" | |
21 | + | |
22 | +import sys | |
23 | +sys.path.insert(0,'../') | |
24 | +from config import Config | |
25 | +from data import PopconXapianIndex, PopconSubmission | |
26 | +from recommender import Recommender | |
27 | +from user import LocalSystem, User | |
28 | +from evaluation import * | |
29 | +import logging | |
30 | +import random | |
31 | +import Gnuplot | |
32 | +import numpy | |
33 | + | |
if __name__ == '__main__':
    # Names used below that the module header does not import
    import os
    import subprocess
    from user import PopconSystem

    # Both arguments are required: sample_file is read from sys.argv[2]
    if len(sys.argv)<3:
        print("Usage: profile-suite strategy_category sample_file")
        sys.exit(1)

    def mean(values):
        """Return the mean of values as a float, 0.0 if there are none."""
        if not values:
            return 0.0
        return float(sum(values))/len(values)

    def to_jpg(png_file,jpg_file,quality):
        """Convert a png graph to jpg with the external convert command."""
        try:
            subprocess.call(["convert","-quality",str(quality),
                             png_file,jpg_file])
        except OSError as error:
            # A missing convert binary must not abort the experiment
            logging.warning("Could not run convert on %s: %s",png_file,error)

    iterations = 20
    profile_size = [10,20,40,70,100,140,170,200,240]
    neighbor_size = [3,5,10,50,100,150,200,300,400,500]

    content_strategies = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
    collaborative_strategies = ['knn_eset']#,'knn_eset','knn_plus']
    #collaborative_strategies = ['knn','knn_eset','knn_plus']

    # Reduced setup for quick runs
    #iterations = 1
    #profile_size = [10,20,30]
    #neighbor_size = [10,20,30]
    #content_strategies = ['cb']
    #collaborative_strategies = ['knn_eset']

    strategy_category = sys.argv[1]
    if strategy_category == "content":
        strategies = content_strategies
        sizes = profile_size
        option_str = "profile"
    elif strategy_category == "collaborative":
        strategies = collaborative_strategies
        sizes = neighbor_size
        option_str = "neighborhood"
    else:
        print("Usage: profile-suite strategy_category sample_file")
        sys.exit(1)

    cfg = Config()
    population_sample = []
    sample_file = sys.argv[2]
    sample_str = sample_file.split('/')[-1]
    # One user id per line; the submission path is built from cfg.popcon_dir,
    # the first two characters of the id and the id itself
    with open(sample_file,'r') as f:
        for line in f:
            user_id = line.strip()
            if user_id:
                population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
    if not population_sample:
        print("Empty sample file: %s" % sample_file)
        sys.exit(1)
    sample_dir = ("results/%s/%s" %
                  (strategy_category,sample_str))
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)

    for strategy in strategies:
        cfg.strategy = strategy
        p_20_summary = {}
        f05_100_summary = {}
        c_20 = {}
        c_100 = {}

        log_file = os.path.join(sample_dir,sample_str+"-"+cfg.strategy)
        graph_20 = log_file+"-20.png"
        graph_100 = log_file+"-100.png"
        # Built from log_file: str.strip(".png") removes characters from
        # both ends, it does not remove a suffix
        graph_20_jpg = log_file+"-20.jpg"
        graph_100_jpg = log_file+"-100.jpg"
        comment_20 = graph_20_jpg+".comment"
        comment_100 = graph_100_jpg+".comment"

        with open(comment_20,'w') as f:
            f.write("# sample %s\n" % sample_str)
            f.write("# strategy %s\n# threshold 20\n# iterations %d\n\n" %
                    (cfg.strategy,iterations))
            f.write("# %s\tp_20\tc_20\n\n"%option_str)
        with open(comment_100,'w') as f:
            f.write("# sample %s\n" % sample_str)
            f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
                    (cfg.strategy,iterations))
            f.write("# %s\t\tf05_100\t\tc_100\n\n"%option_str)

        for size in sizes:
            c_20[size] = set()
            c_100[size] = set()
            p_20_summary[size] = []
            f05_100_summary[size] = []
            with open(log_file+"-%s%.3d"%(option_str,size),'w') as f:
                f.write("# sample %s\n" % sample_str)
                f.write("# strategy %s-%s%.3d\n\n" % (cfg.strategy,option_str,size))
                f.write("# p_20\tf05_100\n\n")

        # main loop per user
        for submission_file in population_sample:
            user = PopconSystem(submission_file)
            user.filter_pkg_profile(cfg.pkgs_filter)
            user.maximal_pkg_profile()
            for size in sizes:
                cfg.profile_size = size
                cfg.k_neighbors = size
                rec = Recommender(cfg)
                repo_size = rec.items_repository.get_doccount()
                p_20 = []
                f05_100 = []
                for n in range(iterations):
                    # Fill sample profile
                    profile_len = len(user.pkg_profile)
                    item_score = {}
                    for pkg in user.pkg_profile:
                        item_score[pkg] = user.item_score[pkg]
                    # Move 90% of the profile to the sample; what is left
                    # in item_score is the profile of the iteration user
                    sample = {}
                    sample_size = int(profile_len*0.9)
                    for i in range(sample_size):
                        key = random.choice(list(item_score.keys()))
                        sample[key] = item_score.pop(key)
                    iteration_user = User(item_score)
                    recommendation = rec.get_recommendation(iteration_user,repo_size)
                    if hasattr(recommendation,"ranking"):
                        ranking = recommendation.ranking
                        real = RecommendationResult(sample)
                        predicted_20 = RecommendationResult(dict.fromkeys(ranking[:20],1))
                        evaluation = Evaluation(predicted_20,real,repo_size)
                        p_20.append(evaluation.run(Precision()))
                        predicted_100 = RecommendationResult(dict.fromkeys(ranking[:100],1))
                        evaluation = Evaluation(predicted_100,real,repo_size)
                        f05_100.append(evaluation.run(F_score(0.5)))
                        c_20[size] = c_20[size].union(ranking[:20])
                        c_100[size] = c_100[size].union(ranking[:100])
                # save summary
                if p_20:
                    p_20_summary[size].append(mean(p_20))
                if f05_100:
                    f05_100_summary[size].append(mean(f05_100))

                # mean() keeps this line from dividing by zero when no
                # iteration produced a ranking for this user
                with open(log_file+"-%s%.3d"%(option_str,size),'a') as f:
                    f.write("%.4f \t%.4f\n" %
                            (mean(p_20),mean(f05_100)))

        # back to main flow
        coverage_20 = {}
        coverage_100 = {}
        with open(comment_20,'a') as f:
            for size in sizes:
                coverage_20[size] = len(c_20[size])/float(repo_size)
                f.write("%3d\t\t%.4f\t\t%.4f\n" %
                        (size,mean(p_20_summary[size]),coverage_20[size]))
        with open(comment_100,'a') as f:
            for size in sizes:
                coverage_100[size] = len(c_100[size])/float(repo_size)
                f.write("%3d\t\t%.4f\t\t%.4f\n" %
                        (size,mean(f05_100_summary[size]),coverage_100[size]))

        # plot results summary
        g = Gnuplot.Gnuplot()
        g('set style data lines')
        g('set yrange [0:1.0]')
        g.xlabel('%s size'%option_str.capitalize())
        g.title("Setup: %s (threshold 20)" % cfg.strategy)
        g.plot(Gnuplot.Data(sorted([[k,mean(p_20_summary[k])]
                                    for k in p_20_summary.keys()]),title="Precision"),
               Gnuplot.Data(sorted([[k,coverage_20[k]]
                                    for k in coverage_20.keys()]),title="Coverage"))
        g.hardcopy(graph_20,terminal="png")
        # The threshold 20 jpg comes from graph_20 (graph_100 was used here
        # before, which does not even exist yet for the first strategy)
        to_jpg(graph_20,graph_20_jpg,20)
        g = Gnuplot.Gnuplot()
        g('set style data lines')
        g('set yrange [0:1.0]')
        g.xlabel('%s size'%option_str.capitalize())
        g.title("Setup: %s (threshold 100)" % cfg.strategy)
        g.plot(Gnuplot.Data(sorted([[k,mean(f05_100_summary[k])]
                                    for k in f05_100_summary.keys()]),title="F05"),
               Gnuplot.Data(sorted([[k,coverage_100[k]]
                                    for k in coverage_100.keys()]),title="Coverage"))
        g.hardcopy(graph_100,terminal="png")
        # NOTE(review): quality 100 here versus 20 above -- confirm intended
        to_jpg(graph_100,graph_100_jpg,100)
... | ... | @@ -0,0 +1,231 @@ |
1 | +#!/usr/bin/env python | |
2 | +""" | |
3 | + recommender suite - recommender experiments suite | |
4 | +""" | |
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | |
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | |
7 | +__license__ = """ | |
8 | + This program is free software: you can redistribute it and/or modify | |
9 | + it under the terms of the GNU General Public License as published by | |
10 | + the Free Software Foundation, either version 3 of the License, or | |
11 | + (at your option) any later version. | |
12 | + | |
13 | + This program is distributed in the hope that it will be useful, | |
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | + GNU General Public License for more details. | |
17 | + | |
18 | + You should have received a copy of the GNU General Public License | |
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | +""" | |
21 | + | |
22 | +import sys | |
23 | +sys.path.insert(0,'../') | |
24 | +from config import Config | |
25 | +from data import PopconXapianIndex, PopconSubmission | |
26 | +from recommender import Recommender | |
27 | +from user import LocalSystem, User | |
28 | +from evaluation import * | |
29 | +import logging | |
30 | +import random | |
31 | +import Gnuplot | |
32 | +import numpy | |
33 | + | |
34 | +#iterations = 3 | |
35 | +#sample_proportions = [0.9] | |
36 | +#weighting = [('bm25',1.2)] | |
37 | +#collaborative = ['knn_eset'] | |
38 | +#content_based = ['cb'] | |
39 | +#hybrid = ['knnco'] | |
40 | +#profile_size = [50,100] | |
41 | +#popcon_size = ["1000"] | |
42 | +#neighbors = [50] | |
43 | + | |
# Experiment setup
# number of random partitions evaluated per configuration
iterations = 30
# fraction of the user profile moved to the sample at each iteration
sample_proportions = [0.9]
# (weighting scheme, bm25 k1 parameter) pairs
weighting = [('bm25',1.0)]
# strategy names, by category
content_based = ['cb','cbt','cbd','cbh',
                 'cb_eset','cbt_eset','cbd_eset','cbh_eset']
collaborative = ['knn_eset','knn','knn_plus']
hybrid = ['knnco','knnco_eset']
# profile sizes and neighborhood sizes to experiment with
profile_size = range(20,200,40)
neighbors = range(10,510,50)
52 | + | |
def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
    """
    Write the recall log of one iteration to "<log_file>-<n>".

    After a header, the log lists in increasing order the position in the
    ranking of each sampled package, followed by the sampled packages that
    are not in the ranking. Only the header is written when recommendation
    has no "ranking" attribute.

    label          -- dict with the "description" and "values" of the setup
    n              -- iteration number
    sample         -- dict keyed by the sampled packages
    recommendation -- recommendation result; its ranking is read if present
    profile_size   -- size of the user profile
    repo_size      -- size of the items repository
    log_file       -- prefix of the log file path
    """
    # The with block closes the log even if a write fails
    with open(("%s-%.2d" % (log_file,n)),'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%.2d\n" % (label["values"],n))
        output.write("\n# repository profile sample\n%d %d %d\n" % \
                     (repo_size,profile_size,len(sample)))
        if not hasattr(recommendation,"ranking"):
            return
        # First position of each package, so that the rank lookup does not
        # scan the whole ranking once per sampled package
        position = {}
        for index,pkg in enumerate(recommendation.ranking):
            position.setdefault(pkg,index)
        ranks = sorted(position[pkg] for pkg in sample if pkg in position)
        notfound = [pkg for pkg in sample if pkg not in position]
        for r in ranks:
            output.write(str(r)+"\n")
        if notfound:
            output.write("# out of recommendation:\n")
            for pkg in notfound:
                output.write(pkg+"\n")
75 | + | |
def plot_roc(roc_points,eauc,c,p,log_file):
    """
    Plot a ROC curve and save it as <log_file>-roc.png and <log_file>-roc.ps.

    roc_points -- points of the curve, plotted with the title "ROC"
    eauc       -- value shown in the "AUC" label
    c          -- value shown in the "C" label
    p          -- value shown in the "P(20)" label
    log_file   -- prefix of the output files; its last path component is
                  used in the plot title
    """
    setup_name = log_file.split("/")[-1]
    chart = Gnuplot.Gnuplot()
    chart('set style data lines')
    chart.xlabel('False Positive Rate')
    chart.ylabel('True Positive Rate')
    chart('set xrange [0:1.0]')
    chart('set yrange [0:1.0]')
    chart.title("Setup: %s" % setup_name)
    chart('set label "C %.2f" at 0.8,0.25' % c)
    chart('set label "P(20) %.2f" at 0.8,0.2' % p)
    chart('set label "AUC %.4f" at 0.8,0.15' % eauc)
    # The second data set draws the diagonal from (0,0) to (1,1)
    roc_curve = Gnuplot.Data(roc_points,title="ROC")
    diagonal = Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7")
    chart.plot(roc_curve,diagonal)
    chart.hardcopy(log_file+"-roc.png",terminal="png")
    chart.hardcopy(log_file+"-roc.ps",terminal="postscript",enhanced=1,color=1)
92 | + | |
def get_label(cfg,sample_proportion):
    """
    Build the label that identifies an experiment setup.

    cfg               -- configuration; strategy, profile_size and
                         k_neighbors are read according to the category
                         of the strategy
    sample_proportion -- not used

    Returns a dict with a "description" of the fields and their "values".
    For a strategy that is in none of the known categories a message is
    printed and an empty dict is returned.
    """
    name = cfg.strategy
    if name in content_based:
        return {"description": "strategy-profile",
                "values": ("%s-profile%.3d" %
                           (cfg.strategy,cfg.profile_size))}
    if name in collaborative:
        return {"description": "strategy-knn",
                "values": ("%s-k%.3d" %
                           (cfg.strategy,cfg.k_neighbors))}
    if name in hybrid:
        return {"description": "strategy-knn-profile",
                "values": ("%s-k%.3d-profile%.3d" %
                           (cfg.strategy,cfg.k_neighbors,cfg.profile_size))}
    print("Unknown strategy")
    return {}
110 | + | |
class ExperimentResults:
    """
    Accumulate precision, recall and fpr of an experiment for a fixed set
    of recommendation sizes (thresholds): 1, 10, 20, ... below the
    repository size.
    """
    def __init__(self,repo_size):
        """
        Create the empty result lists for each threshold.

        repo_size -- size of the items repository
        """
        self.repository_size = repo_size
        self.precision = {}
        self.recall = {}
        self.fpr = {}
        # list() keeps the concatenation valid where range is not a list
        points = [1]+list(range(10,self.repository_size,10))
        self.recommended = set()
        for size in points:
            self.precision[size] = []
            self.recall[size] = []
            self.fpr[size] = []

    def add_result(self,ranking,sample):
        """
        Evaluate one ranking against a sample at each threshold.

        ranking -- ordered recommended items; the first <size> items are
                   the prediction for each threshold
        sample  -- dict keyed by the items expected in the recommendation
        """
        self.recommended.update(ranking)
        # get data only for point
        for size in self.precision:
            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repository_size)
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            self.fpr[size].append(evaluation.run(FPR()))

    # Average ROC by threshold (= size of recommendation)
    def get_roc_points(self):
        """
        Return the sorted list of [mean fpr, mean recall] points, one per
        threshold. Thresholds with no results are skipped, so the list is
        empty if no result was added.
        """
        points = []
        for size in self.recall:
            tpr = self.recall[size]
            fpr = self.fpr[size]
            if not tpr or not fpr:
                # nothing to average for this threshold
                continue
            points.append([sum(fpr)/float(len(fpr)),sum(tpr)/float(len(tpr))])
        return sorted(points)
143 | + | |
def run_strategy(cfg,user):
    """
    Run the ROC experiment for the strategy currently set in cfg.

    For each weighting scheme and sample proportion, random samples are
    moved out of the user profile, a full recommendation is computed from
    the remaining items and evaluated against the sample. A recall log is
    written per iteration; a summary (coverage, p(20), auc, eauc) and the
    ROC plot are saved under results/roc-suite/<first 8 chars of user id>.

    cfg  -- configuration; weight and bm25_k1 are set here
    user -- user whose pkg_profile, item_score and user_id are read
    """
    import os  # used below but not imported by the module header
    for weight in weighting:
        cfg.weight = weight[0]
        cfg.bm25_k1 = weight[1]
        rec = Recommender(cfg)
        repo_size = rec.items_repository.get_doccount()
        for proportion in sample_proportions:
            results = ExperimentResults(repo_size)
            label = get_label(cfg,proportion)
            user_dir = ("results/roc-suite/%s" % user.user_id[:8])
            if not os.path.exists(user_dir):
                # makedirs also creates results/roc-suite when missing
                os.makedirs(user_dir)
            log_file = os.path.join(user_dir,label["values"])
            evaluated = 0
            for n in range(iterations):
                # Fill sample profile
                profile_len = len(user.pkg_profile)
                item_score = {}
                for pkg in user.pkg_profile:
                    item_score[pkg] = user.item_score[pkg]
                sample = {}
                sample_size = int(profile_len*proportion)
                for i in range(sample_size):
                    key = random.choice(list(item_score.keys()))
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,repo_size)
                write_recall_log(label,n,sample,recommendation,profile_len,repo_size,log_file)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
                    evaluated += 1
            if not evaluated:
                # Without results there is nothing to average or plot
                logging.warning("No ranking produced for %s, skipping summary",
                                label["values"])
                continue
            roc_points = results.get_roc_points()
            x_coord = [p[0] for p in roc_points]
            y_coord = [p[1] for p in roc_points]
            auc = numpy.trapz(y=y_coord, x=x_coord)
            # Extend the curve to the corners (0,0) and (1,1)
            eauc = (auc+
                    numpy.trapz(y=[0,roc_points[0][1]],x=[0,roc_points[0][0]])+
                    numpy.trapz(y=[roc_points[-1][1],1],x=[roc_points[-1][0],1]))
            # p(20) is the mean precision at threshold 20 (threshold 10 was
            # read here before, under the same p(20) label)
            precision_20 = sum(results.precision[20])/float(len(results.precision[20]))
            coverage = len(results.recommended)/float(repo_size)
            # Write the summary only after all values were computed
            with open(log_file,'w') as f:
                f.write("# %s\n# %s\n\n" %
                        (label["description"],label["values"]))
                f.write("# coverage \tp(20) \tauc \teauc\n\t%.2f \t%.2f \t%.4f \t%.4f\n\n" %
                        (coverage,precision_20,auc,eauc))
            plot_roc(roc_points,eauc,coverage,precision_20,log_file)
188 | + | |
def run_content(user,cfg):
    """
    Run the experiment for each content-based strategy combined with each
    profile size.
    """
    for name in content_based:
        cfg.strategy = name
        for length in profile_size:
            cfg.profile_size = length
            run_strategy(cfg,user)
195 | + | |
def run_collaborative(user,cfg):
    """
    Run the experiment for each collaborative strategy combined with each
    neighborhood size.
    """
    # The popcon index paths of cfg were copied to locals that were never
    # used; those assignments were dropped.
    for strategy in collaborative:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            run_strategy(cfg,user)
204 | + | |
def run_hybrid(user,cfg):
    """
    Run the experiment for each hybrid strategy combined with each
    neighborhood size and each profile size.
    """
    # The popcon index paths of cfg were copied to locals that were never
    # used; those assignments were dropped.
    for strategy in hybrid:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            for size in profile_size:
                cfg.profile_size = size
                run_strategy(cfg,user)
215 | + | |
if __name__ == '__main__':
    # PopconSystem is used below but the module header imports only
    # LocalSystem and User from user
    from user import PopconSystem

    if len(sys.argv)<2:
        print("Usage: roc-suite popcon_submission_path [content|collaborative|hybrid]")
        sys.exit(1)

    cfg = Config()
    user = PopconSystem(sys.argv[1])
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    # With no category given (fewer than 3 arguments) all of them are run
    if "content" in sys.argv or len(sys.argv)<3:
        run_content(user,cfg)
    if "collaborative" in sys.argv or len(sys.argv)<3:
        run_collaborative(user,cfg)
    if "hybrid" in sys.argv or len(sys.argv)<3:
        run_hybrid(user,cfg)
... | ... | @@ -0,0 +1,44 @@ |
1 | +#! /usr/bin/env python | |
2 | +""" | |
3 | + sample-popcon-arch - extract a sample of a specific arch | |
4 | +""" | |
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | |
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | |
7 | +__license__ = """ | |
8 | + This program is free software: you can redistribute it and/or modify | |
9 | + it under the terms of the GNU General Public License as published by | |
10 | + the Free Software Foundation, either version 3 of the License, or | |
11 | + (at your option) any later version. | |
12 | + | |
13 | + This program is distributed in the hope that it will be useful, | |
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | + GNU General Public License for more details. | |
17 | + | |
18 | + You should have received a copy of the GNU General Public License | |
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | +""" | |
21 | +import sys | |
22 | +sys.path.insert(0,'../') | |
23 | +import xapian | |
24 | +import os | |
25 | +import random | |
26 | +import sys | |
27 | +from user import RandomPopcon | |
28 | + | |
if __name__ == '__main__':
    # Only a missing argument or a non-integer size mean wrong usage; a
    # bare except would also hide unrelated errors and interrupts
    try:
        size = int(sys.argv[1])
        arch = sys.argv[2]
        popcon_dir = sys.argv[3]
        pkgs_filter = sys.argv[4]
    except (IndexError,ValueError):
        print("Usage: sample-popcon-arch size arch popcon_dir pkgs_filter")
        sys.exit(1)

    # One random user id per line
    sample_file = ("results/misc-popcon/sample-%s-%d" % (arch,size))
    with open(sample_file,'w') as f:
        for n in range(1,size+1):
            user = RandomPopcon(popcon_dir,arch,pkgs_filter)
            f.write(user.user_id+'\n')
            print("sample %d" % n)
... | ... | @@ -0,0 +1,53 @@ |
1 | +#! /usr/bin/env python | |
2 | +""" | |
3 | + sample-popcon - extract a sample from popcon population | |
4 | +""" | |
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | |
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | |
7 | +__license__ = """ | |
8 | + This program is free software: you can redistribute it and/or modify | |
9 | + it under the terms of the GNU General Public License as published by | |
10 | + the Free Software Foundation, either version 3 of the License, or | |
11 | + (at your option) any later version. | |
12 | + | |
13 | + This program is distributed in the hope that it will be useful, | |
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | + GNU General Public License for more details. | |
17 | + | |
18 | + You should have received a copy of the GNU General Public License | |
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | +""" | |
21 | + | |
22 | +import xapian | |
23 | +import os | |
24 | +import random | |
25 | +import sys | |
26 | + | |
def extract_sample(size,popcon,min_profile,max_profile,output):
    """
    Extract users whose profile size is in (min_profile,max_profile].

    Documents are visited by increasing id until size users are selected
    or the index is exhausted. The profile size of a user is the number
    of terms of its document that start with "XP". The data of each
    selected document is written, one per line, to
    "<output>-<min_profile>-<max_profile>". Progress is printed on stdout.

    size        -- number of users to extract
    popcon      -- index object providing get_doccount() and get_document(n)
    min_profile -- exclusive lower limit of the profile size
    max_profile -- inclusive upper limit of the profile size
    output      -- prefix of the output file path
    """
    selected = []
    for docid in range(1,popcon.get_doccount()+1):
        document = popcon.get_document(docid)
        profile_len = len([t.term for t in document.termlist()
                           if t.term.startswith("XP")])
        print(profile_len)
        if min_profile < profile_len <= max_profile:
            selected.append(document.get_data())
        print("%d %d" % (docid,len(selected)))
        if len(selected)==size:
            break
    target = "%s-%d-%d"%(output,min_profile,max_profile)
    with open(target,'w') as f:
        for user_data in selected:
            f.write(user_data+'\n')
41 | + | |
if __name__ == '__main__':
    popcon = xapian.Database(os.path.expanduser("~/.app-recommender/popcon_desktopapps"))
    print("Popcon repository size: %d" % popcon.get_doccount())
    # Only a missing or non-integer argument means wrong usage; a bare
    # except would also hide unrelated errors and interrupts
    try:
        min_profile = int(sys.argv[1])
        max_profile = int(sys.argv[2])
        size = int(sys.argv[3])
    except (IndexError,ValueError):
        print("Usage: sample-popcon min_profile max_profile sample_size")
        sys.exit(1)
    sample_file = "results/misc-popcon/sample"
    extract_sample(size,popcon,min_profile,max_profile,sample_file)
src/recommender.py
... | ... | @@ -75,20 +75,20 @@ class Recommender: |
75 | 75 | """ |
76 | 76 | self.cfg = cfg |
77 | 77 | # Load xapian indexes |
78 | - self.axi_programs = xapian.Database(cfg.axi_programs) | |
78 | + #self.axi_programs = xapian.Database(cfg.axi_programs) | |
79 | 79 | self.axi_desktopapps = xapian.Database(cfg.axi_desktopapps) |
80 | 80 | if cfg.popcon: |
81 | - self.popcon_programs = xapian.Database(cfg.popcon_programs) | |
81 | + #self.popcon_programs = xapian.Database(cfg.popcon_programs) | |
82 | 82 | self.popcon_desktopapps = xapian.Database(cfg.popcon_desktopapps) |
83 | 83 | # Load valid programs, desktopapps and tags |
84 | 84 | # format: one package or tag name per line |
85 | - self.valid_programs = [] | |
85 | + #self.valid_programs = [] | |
86 | 86 | self.valid_desktopapps = [] |
87 | 87 | self.valid_tags = [] |
88 | 88 | logging.info("Loading recommender filters") |
89 | - with open(os.path.join(cfg.filters_dir,"programs")) as pkgs: | |
90 | - self.valid_programs = [line.strip() for line in pkgs | |
91 | - if not line.startswith("#")] | |
89 | + #with open(os.path.join(cfg.filters_dir,"programs")) as pkgs: | |
90 | + # self.valid_programs = [line.strip() for line in pkgs | |
91 | + # if not line.startswith("#")] | |
92 | 92 | with open(os.path.join(cfg.filters_dir,"desktopapps")) as pkgs: |
93 | 93 | self.valid_desktopapps = [line.strip() for line in pkgs |
94 | 94 | if not line.startswith("#")] |
... | ... | @@ -109,19 +109,21 @@ class Recommender: |
109 | 109 | Set the recommendation strategy. |
110 | 110 | """ |
111 | 111 | logging.info("Setting recommender strategy to \'%s\'" % strategy_str) |
112 | - if self.cfg.pkgs_filter.split("/")[-1] == "desktopapps": | |
113 | - self.items_repository = self.axi_desktopapps | |
114 | - self.valid_pkgs = self.valid_desktopapps | |
115 | - else: | |
116 | - self.items_repository = self.axi_programs | |
117 | - self.valid_pkgs = self.valid_programs | |
118 | 112 | # Check if collaborative strategies can be instantiated |
119 | - if ("col" in strategy_str) or ("knn" in strategy_str): | |
113 | + if "knn" in strategy_str: | |
120 | 114 | if not self.cfg.popcon: |
121 | 115 | logging.info("Cannot perform collaborative strategy") |
122 | 116 | return 1 |
123 | - else: | |
124 | - self.users_repository = self.popcon_programs | |
117 | + #if self.cfg.pkgs_filter.split("/")[-1] == "desktopapps": | |
118 | + self.items_repository = self.axi_desktopapps | |
119 | + self.valid_pkgs = self.valid_desktopapps | |
120 | + if "knn" in strategy_str: | |
121 | + self.users_repository = self.popcon_desktopapps | |
122 | + #else: | |
123 | + # self.items_repository = self.axi_programs | |
124 | + # self.valid_pkgs = self.valid_programs | |
125 | + # if "knn" in strategy_str: | |
126 | + # self.users_repository = self.popcon_programs | |
125 | 127 | # Set strategy based on strategy_str |
126 | 128 | if strategy_str == "cb": |
127 | 129 | self.strategy = strategy.ContentBased("mix",self.cfg.profile_size) |
... | ... | @@ -151,8 +153,9 @@ class Recommender: |
151 | 153 | self.strategy = strategy.KnnContent(self.cfg.k_neighbors) |
152 | 154 | elif strategy_str == "knnco_eset": |
153 | 155 | self.strategy = strategy.KnnContentEset(self.cfg.k_neighbors) |
154 | - elif strategy_str.startswith("demo"): | |
155 | - self.strategy = strategy.Demographic(strategy_str) | |
156 | + # [FIXME: fix repository instantiation] | |
157 | + #elif strategy_str.startswith("demo"): | |
158 | + # self.strategy = strategy.Demographic(strategy_str) | |
156 | 159 | else: |
157 | 160 | logging.info("Strategy not defined.") |
158 | 161 | return | ... | ... |
src/strategy.py
src/user.py
... | ... | @@ -111,7 +111,7 @@ class User: |
111 | 111 | """ |
112 | 112 | Define a user of a recommender. |
113 | 113 | """ |
114 | - def __init__(self,item_score,user_id=0,demo_profiles_set=0): | |
114 | + def __init__(self,item_score,user_id=0,arch=0,demo_profiles_set=0): | |
115 | 115 | """ |
116 | 116 | Set initial user attributes. pkg_profile gets the whole set of items, |
117 | 117 | a random user_id is set if none was provided and the demographic |
... | ... | @@ -119,6 +119,7 @@ class User: |
119 | 119 | """ |
120 | 120 | self.item_score = item_score |
121 | 121 | self.pkg_profile = self.items() |
122 | + self.arch = arch | |
122 | 123 | |
123 | 124 | if user_id: |
124 | 125 | self.user_id = user_id |
... | ... | @@ -272,21 +273,28 @@ class User: |
272 | 273 | return self.pkg_profile |
273 | 274 | |
274 | 275 | class RandomPopcon(User): |
275 | - def __init__(self,submissions_dir,pkgs_filter=0): | |
276 | + def __init__(self,submissions_dir,arch=0,pkgs_filter=0): | |
276 | 277 | """ |
277 | 278 | Set initial parameters. |
278 | 279 | """ |
279 | 280 | len_profile = 0 |
280 | - while len_profile < 100: | |
281 | + match_arch = False | |
282 | + while len_profile < 100 or not match_arch: | |
281 | 283 | path = random.choice([os.path.join(root, submission) for |
282 | 284 | root, dirs, files in os.walk(submissions_dir) |
283 | 285 | for submission in files]) |
284 | 286 | user = PopconSystem(path) |
287 | + print arch | |
288 | + print user.arch | |
289 | + if arch and user.arch==arch: | |
290 | + match_arch = True | |
291 | + print "match" | |
285 | 292 | if pkgs_filter: |
286 | 293 | user.filter_pkg_profile(pkgs_filter) |
287 | 294 | len_profile = len(user.pkg_profile) |
295 | + print "p",len_profile | |
288 | 296 | submission = data.PopconSubmission(path) |
289 | - User.__init__(self,submission.packages,submission.user_id) | |
297 | + User.__init__(self,submission.packages,submission.user_id,submission.arch) | |
290 | 298 | |
291 | 299 | class PopconSystem(User): |
292 | 300 | def __init__(self,path,user_id=0): |
... | ... | @@ -296,7 +304,7 @@ class PopconSystem(User): |
296 | 304 | submission = data.PopconSubmission(path) |
297 | 305 | if not user_id: |
298 | 306 | user_id = submission.user_id |
299 | - User.__init__(self,submission.packages,user_id) | |
307 | + User.__init__(self,submission.packages,user_id,submission.arch) | |
300 | 308 | |
301 | 309 | class PkgsListSystem(User): |
302 | 310 | def __init__(self,pkgs_list_or_file,user_id=0): | ... | ... |
... | ... | @@ -36,7 +36,7 @@ button below. |
36 | 36 | </div> |
37 | 37 | |
38 | 38 | |
39 | -<form action="/save" method="post" enctype="multipart/form-data" name="surveyform"> | |
39 | +<form action="save" method="post" enctype="multipart/form-data" name="surveyform"> | |
40 | 40 | |
41 | 41 | <input type="hidden" name="user_id" value=$request.user.user_id> |
42 | 42 | <input type="hidden" name="strategy" value=$request.strategy> | ... | ... |