Commit 6d9bfe1d7e44ab36152b3b97fd12208d56f27dfb
Exists in
master
and in
1 other branch
Merge branch 'master' of github.com:tassia/AppRecommender
Showing
28 changed files
with
1920 additions
and
62 deletions
Show diff stats
src/bin/cross_validation.py
@@ -37,7 +37,7 @@ if __name__ == '__main__': | @@ -37,7 +37,7 @@ if __name__ == '__main__': | ||
37 | #user = LocalSystem() | 37 | #user = LocalSystem() |
38 | #user = RandomPopcon(cfg.popcon_dir) | 38 | #user = RandomPopcon(cfg.popcon_dir) |
39 | #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps")) | 39 | #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps")) |
40 | - user = PopconSystem("/home/tassia/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623") | 40 | + user = PopconSystem(os.path.expanduser("~/.app-recommender/popcon-entries/00/0001166d0737c6dffb083071e5ee69f5")) |
41 | user.filter_pkg_profile(os.path.join(cfg.filters_dir,"desktopapps")) | 41 | user.filter_pkg_profile(os.path.join(cfg.filters_dir,"desktopapps")) |
42 | user.maximal_pkg_profile() | 42 | user.maximal_pkg_profile() |
43 | begin_time = datetime.datetime.now() | 43 | begin_time = datetime.datetime.now() |
@@ -48,7 +48,7 @@ if __name__ == '__main__': | @@ -48,7 +48,7 @@ if __name__ == '__main__': | ||
48 | metrics.append(F_score(0.5)) | 48 | metrics.append(F_score(0.5)) |
49 | metrics.append(Accuracy()) | 49 | metrics.append(Accuracy()) |
50 | metrics.append(FPR()) | 50 | metrics.append(FPR()) |
51 | - validation = CrossValidation(0.9,10,rec,metrics,1) | 51 | + validation = CrossValidation(0.9,20,rec,metrics,0.005) |
52 | validation.run(user) | 52 | validation.run(user) |
53 | print validation | 53 | print validation |
54 | 54 |
@@ -0,0 +1,42 @@ | @@ -0,0 +1,42 @@ | ||
1 | +#!/usr/bin/env python | ||
2 | +""" | ||
3 | + AppRecommender - A GNU/Linux application recommender | ||
4 | +""" | ||
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | ||
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | ||
7 | +__license__ = """ | ||
8 | + This program is free software: you can redistribute it and/or modify | ||
9 | + it under the terms of the GNU General Public License as published by | ||
10 | + the Free Software Foundation, either version 3 of the License, or | ||
11 | + (at your option) any later version. | ||
12 | + | ||
13 | + This program is distributed in the hope that it will be useful, | ||
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | + GNU General Public License for more details. | ||
17 | + | ||
18 | + You should have received a copy of the GNU General Public License | ||
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 | +""" | ||
21 | + | ||
22 | +import os | ||
23 | +import sys | ||
24 | +sys.path.insert(0,'../') | ||
25 | +import xapian | ||
26 | + | ||
27 | +if __name__ == '__main__': | ||
28 | + if len(sys.argv)<2: | ||
29 | + print "Usage: get_axipkgs index_path" | ||
30 | + exit(1) | ||
31 | + | ||
32 | + axi_path = sys.argv[1] | ||
33 | + axi = xapian.Database(axi_path) | ||
34 | + for n in range(1,axi.get_lastdocid()): | ||
35 | + doc = 0 | ||
36 | + try: | ||
37 | + doc = axi.get_document(n) | ||
38 | + except: | ||
39 | + pass | ||
40 | + if doc: | ||
41 | + xp_terms = [t.term for t in doc.termlist() if t.term.startswith("XP")] | ||
42 | + print xp_terms[0].lstrip('XP') |
src/bin/get_desktop.sh
1 | #!/usr/bin/env bash | 1 | #!/usr/bin/env bash |
2 | # | 2 | # |
3 | -# get_desktop.sh - get packages which have desktop files | 3 | +# get_desktop.sh - get packages which have desktop files |
4 | +# | ||
5 | +# DEPRECATED: use get_axipkgs.py to get this info from axi | ||
4 | 6 | ||
5 | cd /usr/share/app-install/desktop | 7 | cd /usr/share/app-install/desktop |
6 | sed -ne 's/X-AppInstall-Package=//p' * | sort -u | grep -v kdelibs | grep -v libfm-gtk0 | 8 | sed -ne 's/X-AppInstall-Package=//p' * | sort -u | grep -v kdelibs | grep -v libfm-gtk0 |
src/bin/get_pkgs_inst.py
1 | #!/usr/bin/env python | 1 | #!/usr/bin/env python |
2 | # | 2 | # |
3 | # get_pkgs_inst.py - get tuple (package,installation) from popcon results file | 3 | # get_pkgs_inst.py - get tuple (package,installation) from popcon results file |
4 | +# | ||
5 | +# results_file: org/popcon.debian.org/popcon-mail/results | ||
4 | 6 | ||
7 | +import sys | ||
5 | from operator import itemgetter | 8 | from operator import itemgetter |
9 | + | ||
6 | if __name__ == '__main__': | 10 | if __name__ == '__main__': |
11 | + if len(sys.argv)<2: | ||
12 | + print "Usage: get_pkgs_inst popcon_results_path" | ||
13 | + exit(1) | ||
14 | + | ||
15 | + results_path = sys.argv[1] | ||
7 | pkgs_inst = {} | 16 | pkgs_inst = {} |
8 | - with open("/root/org/popcon.debian.org/popcon-mail/results") as results: | 17 | + with open(results_path) as results: |
9 | for line in results: | 18 | for line in results: |
10 | if line.startswith("Package"): | 19 | if line.startswith("Package"): |
11 | fields = line.split() | 20 | fields = line.split() |
12 | inst = int(fields[2])+int(fields[3])+int(fields[4]) | 21 | inst = int(fields[2])+int(fields[3])+int(fields[4]) |
13 | - if inst > 20: | ||
14 | - pkgs_inst[fields[1]] = inst | 22 | + pkgs_inst[fields[1]] = inst |
15 | sorted_by_inst = sorted(pkgs_inst.items(), key=itemgetter(1)) | 23 | sorted_by_inst = sorted(pkgs_inst.items(), key=itemgetter(1)) |
16 | for pkg, inst in sorted_by_inst: | 24 | for pkg, inst in sorted_by_inst: |
17 | print pkg, inst | 25 | print pkg, inst |
@@ -0,0 +1,77 @@ | @@ -0,0 +1,77 @@ | ||
1 | +#!/usr/bin/env python | ||
2 | +""" | ||
3 | + indexer.py - generate xapian indexes to be used as items and users | ||
4 | + repositories | ||
5 | +""" | ||
6 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | ||
7 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | ||
8 | +__license__ = """ | ||
9 | + This program is free software: you can redistribute it and/or modify | ||
10 | + it under the terms of the GNU General Public License as published by | ||
11 | + the Free Software Foundation, either version 3 of the License, or | ||
12 | + (at your option) any later version. | ||
13 | + | ||
14 | + This program is distributed in the hope that it will be useful, | ||
15 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
17 | + GNU General Public License for more details. | ||
18 | + | ||
19 | + You should have received a copy of the GNU General Public License | ||
20 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
21 | +""" | ||
22 | + | ||
23 | +import os | ||
24 | +import sys | ||
25 | +sys.path.insert(0,'../') | ||
26 | +import datetime | ||
27 | + | ||
28 | +from config import Config | ||
29 | +from error import Error | ||
30 | +import data | ||
31 | +import xapian | ||
32 | + | ||
33 | +if __name__ == '__main__': | ||
34 | + axi_path = "/var/lib/apt-xapian-index/index" | ||
35 | + axi = xapian.Database(axi_path) | ||
36 | + base_dir = os.path.expanduser("~/.app-recommender/") | ||
37 | + | ||
38 | + begin_time = datetime.datetime.now() | ||
39 | + | ||
40 | + # axi sample based on the pkgs sample provided by command line | ||
41 | + if "sample" in sys.argv: | ||
42 | + print ("Sample package indexing started at %s" % begin_time) | ||
43 | + if len(sys.argv) > 2: | ||
44 | + pkgs_filter = sys.argv[2] | ||
45 | + else: | ||
46 | + print "Usage: indexer axi_sample pkgs_sample_file" | ||
47 | + exit(1) | ||
48 | + with open(pkgs_filter) as valid: | ||
49 | + pkgs_list = [line.strip() for line in valid] | ||
50 | + filter_str = pkgs_filter.split("/")[-1] | ||
51 | + index = data.SampleAptXapianIndex(pkgs_list,axi, | ||
52 | + os.path.join(base_dir,"axi_"+filter_str)) | ||
53 | + print ("Axi size: %d" % axi.get_doccount()) | ||
54 | + print ("Packages list length: %d" % len(pkgs_list)) | ||
55 | + print ("Sample index size: %d" % | ||
56 | + index.get_doccount()) | ||
57 | + | ||
58 | + # axi filtered by terms provided by command line | ||
59 | + if "filter" in sys.argv: | ||
60 | + print ("Filtered package indexing started at %s" % begin_time) | ||
61 | + if len(sys.argv) > 2: | ||
62 | + terms = sys.argv[2:] | ||
63 | + else: | ||
64 | + print ("Usage: indexer axi_filter term [additional terms]") | ||
65 | + exit(1) | ||
66 | + terms_str = "_".join([t.split("::")[-1] for t in terms]) | ||
67 | + index = data.FilteredXapianIndex(terms,axi, | ||
68 | + os.path.join(base_dir,"axi_"+terms_str)) | ||
69 | + print ("Axi size: %d" % axi.get_doccount()) | ||
70 | + print ("Terms filter: %s" % terms) | ||
71 | + print ("Filtered index size: %d" % | ||
72 | + index.get_doccount()) | ||
73 | + | ||
74 | + end_time = datetime.datetime.now() | ||
75 | + print ("Indexing completed at %s" % end_time) | ||
76 | + delta = end_time - begin_time | ||
77 | + print ("Time elapsed: %d seconds." % delta.seconds) |
@@ -0,0 +1,52 @@ | @@ -0,0 +1,52 @@ | ||
1 | +#!/usr/bin/env python | ||
2 | +""" | ||
3 | + popindex.py - generate a popcon index to be used by the recommender as the | ||
4 | + users repository, based on filters provided by config | ||
5 | +""" | ||
6 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | ||
7 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | ||
8 | +__license__ = """ | ||
9 | + This program is free software: you can redistribute it and/or modify | ||
10 | + it under the terms of the GNU General Public License as published by | ||
11 | + the Free Software Foundation, either version 3 of the License, or | ||
12 | + (at your option) any later version. | ||
13 | + | ||
14 | + This program is distributed in the hope that it will be useful, | ||
15 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
17 | + GNU General Public License for more details. | ||
18 | + | ||
19 | + You should have received a copy of the GNU General Public License | ||
20 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
21 | +""" | ||
22 | +import os | ||
23 | +import sys | ||
24 | +sys.path.insert(0,'../') | ||
25 | +import logging | ||
26 | +import datetime | ||
27 | + | ||
28 | +from config import Config | ||
29 | +from data import FilteredPopconXapianIndex | ||
30 | + | ||
31 | +if __name__ == '__main__': | ||
32 | + base_dir = os.path.expanduser("~/.app-recommender/") | ||
33 | + axi_path = os.path.join(base_dir,"axi_XD") | ||
34 | + path = os.path.join(base_dir,"popcon_XD") | ||
35 | + popcon_dir = os.path.join(base_dir,"popcon-entries") | ||
36 | + tags_filter = os.path.join(base_dir,"filters/debtags") | ||
37 | + | ||
38 | + # set up config for logging | ||
39 | + cfg = Config() | ||
40 | + | ||
41 | + begin_time = datetime.datetime.now() | ||
42 | + logging.info("Popcon indexing started at %s" % begin_time) | ||
43 | + # use config file or command line options | ||
44 | + index = FilteredPopconXapianIndex(path,popcon_dir,axi_path,tags_filter) | ||
45 | + | ||
46 | + end_time = datetime.datetime.now() | ||
47 | + logging.info("Popcon indexing completed at %s" % end_time) | ||
48 | + logging.info("Number of documents (submissions): %d" % | ||
49 | + index.get_doccount()) | ||
50 | + | ||
51 | + delta = end_time - begin_time | ||
52 | + logging.info("Time elapsed: %d seconds." % delta.seconds) |
src/config.py
@@ -40,7 +40,7 @@ class Config(Singleton): | @@ -40,7 +40,7 @@ class Config(Singleton): | ||
40 | ## general options | 40 | ## general options |
41 | self.debug = 0 | 41 | self.debug = 0 |
42 | self.verbose = 1 | 42 | self.verbose = 1 |
43 | - self.output = "log" | 43 | + self.output = "apprec.log" |
44 | 44 | ||
45 | ## data_source options | 45 | ## data_source options |
46 | self.base_dir = os.path.expanduser("~/.app-recommender/") | 46 | self.base_dir = os.path.expanduser("~/.app-recommender/") |
@@ -103,13 +103,14 @@ class Config(Singleton): | @@ -103,13 +103,14 @@ class Config(Singleton): | ||
103 | print " -f, --filtersdir=PATH Path to filters directory" | 103 | print " -f, --filtersdir=PATH Path to filters directory" |
104 | print " -b, --pkgsfilter=FILTER File containing packages to be considered for recommendations" | 104 | print " -b, --pkgsfilter=FILTER File containing packages to be considered for recommendations" |
105 | print " -a, --axi=PATH Path to apt-xapian-index" | 105 | print " -a, --axi=PATH Path to apt-xapian-index" |
106 | - print " -e, --dde=URL DDE url" | ||
107 | print " -p, --popconindex=PATH Path to popcon index" | 106 | print " -p, --popconindex=PATH Path to popcon index" |
108 | - print " -m, --popcondir=PATH Path to popcon submissions dir" | ||
109 | - print " -u, --indexmode=MODE 'old'|'reindex'|'cluster'|'recluster'" | ||
110 | - print " -l, --clustersdir=PATH Path to popcon clusters dir" | ||
111 | - print " -c, --medoids=k Number of medoids for clustering" | ||
112 | - print " -x, --maxpopcon=k Number of submissions to be considered" | 107 | + print " -e, --dde=URL DDE url" |
108 | + # deprecated options | ||
109 | + #print " -m, --popcondir=PATH Path to popcon submissions dir" | ||
110 | + #print " -u, --indexmode=MODE 'old'|'reindex'|'cluster'|'recluster'" | ||
111 | + #print " -l, --clustersdir=PATH Path to popcon clusters dir" | ||
112 | + #print " -c, --medoids=k Number of medoids for clustering" | ||
113 | + #print " -x, --maxpopcon=k Number of submissions to be considered" | ||
113 | print "" | 114 | print "" |
114 | print " [ recommender ]" | 115 | print " [ recommender ]" |
115 | print " -w, --weight=OPTION Search weighting scheme" | 116 | print " -w, --weight=OPTION Search weighting scheme" |
@@ -123,11 +124,19 @@ class Config(Singleton): | @@ -123,11 +124,19 @@ class Config(Singleton): | ||
123 | print " bm25 = bm25 weighting scheme" | 124 | print " bm25 = bm25 weighting scheme" |
124 | print "" | 125 | print "" |
125 | print " [ strategy options ] " | 126 | print " [ strategy options ] " |
126 | - print " cb = content-based " | ||
127 | - print " cbt = content-based using only tags as content " | ||
128 | - print " cbd = content-based using only package descriptions as content " | ||
129 | - print " col = collaborative " | ||
130 | - print " colct = collaborative through tags content " | 127 | + print " cb = content-based, mixed profile" |
128 | + print " cbt = content-based, tags only profile" | ||
129 | + print " cbd = content-based, description terms only profile" | ||
130 | + print " cbh = content-based, half-half profile" | ||
131 | + print " cb_eset = cb with eset profiling" | ||
132 | + print " cbt_eset = cbt with eset profiling" | ||
133 | + print " cbd_eset = cbd_eset with eset profiling" | ||
134 | + print " cbh_eset = cbh with eset profiling" | ||
135 | + print " knn = collaborative, tf-idf knn" | ||
136 | + print " knn_plus = collaborative, tf-idf weighted knn" | ||
137 | + print " knn_eset = collaborative, eset knn" | ||
138 | + print " knnco = collaborative through content" | ||
139 | + print " knnco_eset = collaborative through content, eset recommendation" | ||
131 | 140 | ||
132 | def read_option(self, section, option): | 141 | def read_option(self, section, option): |
133 | """ | 142 | """ |
src/data.py
@@ -30,12 +30,26 @@ import shutil | @@ -30,12 +30,26 @@ import shutil | ||
30 | import apt | 30 | import apt |
31 | import re | 31 | import re |
32 | import operator | 32 | import operator |
33 | +import urllib | ||
34 | +import simplejson as json | ||
33 | 35 | ||
34 | from error import Error | 36 | from error import Error |
35 | from singleton import Singleton | 37 | from singleton import Singleton |
36 | from dissimilarity import * | 38 | from dissimilarity import * |
37 | from config import Config | 39 | from config import Config |
38 | 40 | ||
41 | +def axi_get_pkgs(axi): | ||
42 | + pkgs_names = [] | ||
43 | + for docid in range(1,axi.get_lastdocid()+1): | ||
44 | + try: | ||
45 | + doc = axi.get_document(docid) | ||
46 | + except: | ||
47 | + pass | ||
48 | + docterms_XP = [t.term for t in doc.termlist() | ||
49 | + if t.term.startswith("XP")] | ||
50 | + pkgs_names.append(docterms_XP[0].lstrip('XP')) | ||
51 | + return pkgs_names | ||
52 | + | ||
39 | def axi_search_pkgs(axi,pkgs_list): | 53 | def axi_search_pkgs(axi,pkgs_list): |
40 | terms = ["XP"+item for item in pkgs_list] | 54 | terms = ["XP"+item for item in pkgs_list] |
41 | query = xapian.Query(xapian.Query.OP_OR, terms) | 55 | query = xapian.Query(xapian.Query.OP_OR, terms) |
@@ -110,30 +124,39 @@ def tfidf_plus(index,docs,content_filter): | @@ -110,30 +124,39 @@ def tfidf_plus(index,docs,content_filter): | ||
110 | variance = sum([(p-mean)*(p-mean) for p in population])/len(population) | 124 | variance = sum([(p-mean)*(p-mean) for p in population])/len(population) |
111 | standard_deviation = math.sqrt(variance) | 125 | standard_deviation = math.sqrt(variance) |
112 | for d in docs: | 126 | for d in docs: |
113 | - normalized_weigths[d.docid] = d.weight/standard_deviation | 127 | + if standard_deviation>1: |
128 | + # values between [0-1] would cause the opposite effect | ||
129 | + normalized_weigths[d.docid] = d.weight/standard_deviation | ||
130 | + else: | ||
131 | + normalized_weigths[d.docid] = d.weight | ||
114 | return tfidf_weighting(index,docs,content_filter,normalized_weigths) | 132 | return tfidf_weighting(index,docs,content_filter,normalized_weigths) |
115 | 133 | ||
116 | -class AppAptXapianIndex(xapian.WritableDatabase): | 134 | +class FilteredXapianIndex(xapian.WritableDatabase): |
117 | """ | 135 | """ |
118 | - Data source for application packages information | 136 | + Filtered Xapian Index |
119 | """ | 137 | """ |
120 | - def __init__(self,axi_path,path): | 138 | + def __init__(self,terms,index_path,path): |
121 | xapian.WritableDatabase.__init__(self,path, | 139 | xapian.WritableDatabase.__init__(self,path, |
122 | xapian.DB_CREATE_OR_OVERWRITE) | 140 | xapian.DB_CREATE_OR_OVERWRITE) |
123 | - axi = xapian.Database(axi_path) | ||
124 | - logging.info("AptXapianIndex size: %d" % axi.get_doccount()) | ||
125 | - for docid in range(1,axi.get_lastdocid()+1): | 141 | + index = xapian.Database(index_path) |
142 | + for docid in range(1,index.get_lastdocid()+1): | ||
126 | try: | 143 | try: |
127 | - doc = axi.get_document(docid) | ||
128 | - allterms = [term.term for term in doc.termlist()] | ||
129 | - if "XTrole::program" in allterms: | 144 | + doc = index.get_document(docid) |
145 | + docterms = [term.term for term in doc.termlist()] | ||
146 | + tagged = False | ||
147 | + for t in terms: | ||
148 | + if t in docterms: | ||
149 | + tagged = True | ||
150 | + if tagged: | ||
130 | self.add_document(doc) | 151 | self.add_document(doc) |
131 | logging.info("Added doc %d." % docid) | 152 | logging.info("Added doc %d." % docid) |
132 | else: | 153 | else: |
133 | logging.info("Discarded doc %d." % docid) | 154 | logging.info("Discarded doc %d." % docid) |
134 | except: | 155 | except: |
135 | logging.info("Doc %d not found in axi." % docid) | 156 | logging.info("Doc %d not found in axi." % docid) |
136 | - logging.info("AppAptXapianIndex size: %d (lastdocid: %d)." % | 157 | + logging.info("Filter: %s" % terms) |
158 | + logging.info("Index size: %d" % index.get_doccount()) | ||
159 | + logging.info("Filtered Index size: %d (lastdocid: %d)." % | ||
137 | (self.get_doccount(), self.get_lastdocid())) | 160 | (self.get_doccount(), self.get_lastdocid())) |
138 | 161 | ||
139 | def __str__(self): | 162 | def __str__(self): |
@@ -186,13 +209,13 @@ class DebianPackage(): | @@ -186,13 +209,13 @@ class DebianPackage(): | ||
186 | if pkg_version.record.has_key('Conflicts'): | 209 | if pkg_version.record.has_key('Conflicts'): |
187 | self.conflicts = pkg_version.record['Conflicts'] | 210 | self.conflicts = pkg_version.record['Conflicts'] |
188 | if pkg_version.record.has_key('Replaces'): | 211 | if pkg_version.record.has_key('Replaces'): |
189 | - self.conflicts = pkg_version.record['Replaces'] | 212 | + self.replaces = pkg_version.record['Replaces'] |
190 | if pkg_version.record.has_key('Provides'): | 213 | if pkg_version.record.has_key('Provides'): |
191 | self.provides = pkg_version.record['Provides'] | 214 | self.provides = pkg_version.record['Provides'] |
192 | 215 | ||
193 | def load_details_from_dde(self,dde_server,dde_port): | 216 | def load_details_from_dde(self,dde_server,dde_port): |
194 | - json_data = json.load(urllib.urlopen("http://%s:%s/q/udd/packages/all/%s?t=json" | ||
195 | - % dde_server,dde_port,self.name)) | 217 | + json_data = json.load(urllib.urlopen("http://%s:%d/q/udd/packages/prio-debian-sid/%s?t=json" |
218 | + % (dde_server,dde_port,self.name))) | ||
196 | 219 | ||
197 | self.maintainer = json_data['r']['maintainer'] | 220 | self.maintainer = json_data['r']['maintainer'] |
198 | self.version = json_data['r']['version'] | 221 | self.version = json_data['r']['version'] |
@@ -200,27 +223,27 @@ class DebianPackage(): | @@ -200,27 +223,27 @@ class DebianPackage(): | ||
200 | self.description = self.format_description(json_data['r']['long_description']) | 223 | self.description = self.format_description(json_data['r']['long_description']) |
201 | self.section = json_data['r']['section'] | 224 | self.section = json_data['r']['section'] |
202 | if json_data['r']['homepage']: | 225 | if json_data['r']['homepage']: |
203 | - self.conflicts = json_data['r']['homepage'] | 226 | + self.homepage = json_data['r']['homepage'] |
204 | if json_data['r']['tag']: | 227 | if json_data['r']['tag']: |
205 | self.tags = self.debtags_list_to_dict(json_data['r']['tag']) | 228 | self.tags = self.debtags_list_to_dict(json_data['r']['tag']) |
206 | if json_data['r']['depends']: | 229 | if json_data['r']['depends']: |
207 | self.depends = json_data['r']['depends'] | 230 | self.depends = json_data['r']['depends'] |
208 | if json_data['r']['pre_depends']: | 231 | if json_data['r']['pre_depends']: |
209 | - self.conflicts = json_data['r']['pre_depends'] | 232 | + self.predepends = json_data['r']['pre_depends'] |
210 | if json_data['r']['recommends']: | 233 | if json_data['r']['recommends']: |
211 | - self.conflicts = json_data['r']['recommends'] | 234 | + self.recommends = json_data['r']['recommends'] |
212 | if json_data['r']['suggests']: | 235 | if json_data['r']['suggests']: |
213 | - self.conflicts = json_data['r']['suggests'] | 236 | + self.suggests = json_data['r']['suggests'] |
214 | if json_data['r']['conflicts']: | 237 | if json_data['r']['conflicts']: |
215 | self.conflicts = json_data['r']['conflicts'] | 238 | self.conflicts = json_data['r']['conflicts'] |
216 | if json_data['r']['replaces']: | 239 | if json_data['r']['replaces']: |
217 | - self.conflicts = json_data['r']['replaces'] | 240 | + self.replaces = json_data['r']['replaces'] |
218 | if json_data['r']['provides']: | 241 | if json_data['r']['provides']: |
219 | - self.conflicts = json_data['r']['provides'] | 242 | + self.provides = json_data['r']['provides'] |
220 | self.popcon_insts = json_data['r']['popcon']['insts'] | 243 | self.popcon_insts = json_data['r']['popcon']['insts'] |
221 | 244 | ||
222 | def format_description(self,description): | 245 | def format_description(self,description): |
223 | - return description.replace('.\n','').replace('\n','<br />') | 246 | + return description.replace(' .\n','<br />').replace('\n','<br />') |
224 | 247 | ||
225 | def debtags_str_to_dict(self, debtags_str): | 248 | def debtags_str_to_dict(self, debtags_str): |
226 | debtags_list = [tag.rstrip(",") for tag in debtags_str.split()] | 249 | debtags_list = [tag.rstrip(",") for tag in debtags_str.split()] |
@@ -281,6 +304,7 @@ class PopconSubmission(): | @@ -281,6 +304,7 @@ class PopconSubmission(): | ||
281 | for line in submission: | 304 | for line in submission: |
282 | if line.startswith("POPULARITY"): | 305 | if line.startswith("POPULARITY"): |
283 | self.user_id = line.split()[2].lstrip("ID:") | 306 | self.user_id = line.split()[2].lstrip("ID:") |
307 | + self.arch = line.split()[3].lstrip("ARCH:") | ||
284 | elif not line.startswith("END-POPULARITY"): | 308 | elif not line.startswith("END-POPULARITY"): |
285 | data = line.rstrip('\n').split() | 309 | data = line.rstrip('\n').split() |
286 | if len(data) > 2: | 310 | if len(data) > 2: |
@@ -304,6 +328,82 @@ class PopconSubmission(): | @@ -304,6 +328,82 @@ class PopconSubmission(): | ||
304 | elif data[4] == '<RECENT-CTIME>': | 328 | elif data[4] == '<RECENT-CTIME>': |
305 | self.packages[pkg] = 8 | 329 | self.packages[pkg] = 8 |
306 | 330 | ||
331 | +class FilteredPopconXapianIndex(xapian.WritableDatabase): | ||
332 | + """ | ||
333 | + Data source for popcon submissions defined as a xapian database. | ||
334 | + """ | ||
335 | + def __init__(self,path,popcon_dir,axi_path,tags_filter): | ||
336 | + """ | ||
337 | + Set initial attributes. | ||
338 | + """ | ||
339 | + self.axi = xapian.Database(axi_path) | ||
340 | + self.path = os.path.expanduser(path) | ||
341 | + self.popcon_dir = os.path.expanduser(popcon_dir) | ||
342 | + self.valid_pkgs = axi_get_pkgs(self.axi) | ||
343 | + logging.debug("Considering %d valid packages" % len(self.valid_pkgs)) | ||
344 | + with open(tags_filter) as valid_tags: | ||
345 | + self.valid_tags = [line.strip() for line in valid_tags | ||
346 | + if not line.startswith("#")] | ||
347 | + logging.debug("Considering %d valid tags" % len(self.valid_tags)) | ||
348 | + if not os.path.exists(self.popcon_dir): | ||
349 | + os.makedirs(self.popcon_dir) | ||
350 | + if not os.listdir(self.popcon_dir): | ||
351 | + logging.critical("Popcon dir seems to be empty.") | ||
352 | + raise Error | ||
353 | + | ||
354 | + # set up directory | ||
355 | + shutil.rmtree(self.path,1) | ||
356 | + os.makedirs(self.path) | ||
357 | + try: | ||
358 | + logging.info("Indexing popcon submissions from \'%s\'" % | ||
359 | + self.popcon_dir) | ||
360 | + logging.info("Creating new xapian index at \'%s\'" % | ||
361 | + self.path) | ||
362 | + xapian.WritableDatabase.__init__(self,self.path, | ||
363 | + xapian.DB_CREATE_OR_OVERWRITE) | ||
364 | + except xapian.DatabaseError as e: | ||
365 | + logging.critical("Could not create popcon xapian index.") | ||
366 | + logging.critical(str(e)) | ||
367 | + raise Error | ||
368 | + | ||
369 | + # build new index | ||
370 | + doc_count = 0 | ||
371 | + for root, dirs, files in os.walk(self.popcon_dir): | ||
372 | + for popcon_file in files: | ||
373 | + submission = PopconSubmission(os.path.join(root, popcon_file)) | ||
374 | + doc = xapian.Document() | ||
375 | + submission_pkgs = submission.get_filtered(self.valid_pkgs) | ||
376 | + if len(submission_pkgs) < 10: | ||
377 | + logging.debug("Low profile popcon submission \'%s\' (%d)" % | ||
378 | + (submission.user_id,len(submission_pkgs))) | ||
379 | + else: | ||
380 | + doc.set_data(submission.user_id) | ||
381 | + doc.add_term("ID"+submission.user_id) | ||
382 | + doc.add_term("ARCH"+submission.arch) | ||
383 | + logging.debug("Parsing popcon submission \'%s\'" % | ||
384 | + submission.user_id) | ||
385 | + for pkg,freq in submission_pkgs.items(): | ||
386 | + tags = axi_search_pkg_tags(self.axi,pkg) | ||
387 | + # if the package was found in axi | ||
388 | + if tags: | ||
389 | + doc.add_term("XP"+pkg,freq) | ||
390 | + # if the package has tags associated with it | ||
391 | + if not tags == "notags": | ||
392 | + for tag in tags: | ||
393 | + if tag.lstrip("XT") in self.valid_tags: | ||
394 | + doc.add_term(tag,freq) | ||
395 | + doc_id = self.add_document(doc) | ||
396 | + doc_count += 1 | ||
397 | + logging.debug("Popcon Xapian: Indexing doc %d" % doc_id) | ||
398 | + # python garbage collector | ||
399 | + gc.collect() | ||
400 | + # flush to disk database changes | ||
401 | + try: | ||
402 | + self.commit() | ||
403 | + except: | ||
404 | + self.flush() # deprecated function, used for compatibility with old lib version | ||
405 | + | ||
406 | +# Deprecated class, must be reviewed | ||
307 | class PopconXapianIndex(xapian.WritableDatabase): | 407 | class PopconXapianIndex(xapian.WritableDatabase): |
308 | """ | 408 | """ |
309 | Data source for popcon submissions defined as a singleton xapian database. | 409 | Data source for popcon submissions defined as a singleton xapian database. |
src/evaluation.py
@@ -140,6 +140,29 @@ class FPR(Metric): | @@ -140,6 +140,29 @@ class FPR(Metric): | ||
140 | return (float(len(evaluation.false_positive))/ | 140 | return (float(len(evaluation.false_positive))/ |
141 | evaluation.real_negative_len) | 141 | evaluation.real_negative_len) |
142 | 142 | ||
143 | +class MCC(Metric): | ||
144 | + """ | ||
145 | + Matthews correlation coefficient. | ||
146 | + """ | ||
147 | + def __init__(self): | ||
148 | + """ | ||
149 | + Set metric description. | ||
150 | + """ | ||
151 | + self.desc = " MCC " | ||
152 | + | ||
153 | + def run(self,evaluation): | ||
154 | + """ | ||
155 | + Compute metric. | ||
156 | + """ | ||
157 | + VP = len(evaluation.true_positive) | ||
158 | + FP = len(evaluation.false_positive) | ||
159 | + FN = len(evaluation.false_negative) | ||
160 | + VN = evaluation.true_negative_len | ||
161 | + if (VP+FP)==0 or (VP+FN)==0 or (VN+FP)==0 or (VN+FN)==0: | ||
162 | + return 0 | ||
163 | + MCC = (((VP*VN)-(FP*FN))/math.sqrt((VP+FP)*(VP+FN)*(VN+FP)*(VN+FN))) | ||
164 | + return MCC | ||
165 | + | ||
143 | class F_score(Metric): | 166 | class F_score(Metric): |
144 | """ | 167 | """ |
145 | Classification accuracy metric which correlates precision and recall into an | 168 | Classification accuracy metric which correlates precision and recall into an |
@@ -0,0 +1,51 @@ | @@ -0,0 +1,51 @@ | ||
1 | +#!/usr/bin/env python | ||
2 | +""" | ||
3 | + recommender suite - recommender experiments suite | ||
4 | +""" | ||
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | ||
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | ||
7 | +__license__ = """ | ||
8 | + This program is free software: you can redistribute it and/or modify | ||
9 | + it under the terms of the GNU General Public License as published by | ||
10 | + the Free Software Foundation, either version 3 of the License, or | ||
11 | + (at your option) any later version. | ||
12 | + | ||
13 | + This program is distributed in the hope that it will be useful, | ||
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | + GNU General Public License for more details. | ||
17 | + | ||
18 | + You should have received a copy of the GNU General Public License | ||
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 | +""" | ||
21 | + | ||
22 | +import sys | ||
23 | +import os | ||
24 | +sys.path.insert(0,'../') | ||
25 | +from config import Config | ||
26 | +from data import PopconXapianIndex, PopconSubmission | ||
27 | +from recommender import Recommender | ||
28 | +from user import LocalSystem, User | ||
29 | +from evaluation import * | ||
30 | +import logging | ||
31 | +import random | ||
32 | +import Gnuplot | ||
33 | + | ||
34 | +if __name__ == '__main__': | ||
35 | + | ||
36 | + cfg = Config() | ||
37 | + cfg.index_mode = "recluster" | ||
38 | + logging.info("Starting clustering experiments") | ||
39 | + logging.info("Medoids: %d\t Max popcon:%d" % (cfg.k_medoids,cfg.max_popcon)) | ||
40 | + cfg.popcon_dir = os.path.expanduser("~/org/popcon.debian.org/popcon-mail/popcon-entries/") | ||
41 | + cfg.popcon_index = cfg.popcon_index+("_%dmedoids%dmax" % | ||
42 | + (cfg.k_medoids,cfg.max_popcon)) | ||
43 | + cfg.clusters_dir = cfg.clusters_dir+("_%dmedoids%dmax" % | ||
44 | + (cfg.k_medoids,cfg.max_popcon)) | ||
45 | + pxi = PopconXapianIndex(cfg) | ||
46 | + logging.info("Overall dispersion: %f\n" % pxi.cluster_dispersion) | ||
47 | + # Write clustering log | ||
48 | + output = open(("results/clustering/%dmedoids%dmax" % (cfg.k_medoids,cfg.max_popcon)),'w') | ||
49 | + output.write("# k_medoids\tmax_popcon\tdispersion\n") | ||
50 | + output.write("%d %f\n" % (cfg.k_medoids,cfg.max_popcon,pxi.cluster_dispersion)) | ||
51 | + output.close() |
@@ -0,0 +1,27 @@ | @@ -0,0 +1,27 @@ | ||
1 | +[DEFAULT] | ||
2 | +repetitions = 1 | ||
3 | +iterations = 10 | ||
4 | +path = 'results' | ||
5 | +experiment = 'grid' | ||
6 | +weight = ['bm25', 'trad'] | ||
7 | +;profile_size = range(10,100,10) | ||
8 | +;sample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] | ||
9 | +sample = [0.6, 0.7, 0.8, 0.9] | ||
10 | + | ||
11 | +[content] | ||
12 | +strategy = ['cb','cbt','cbd'] | ||
13 | + | ||
14 | +[clustering] | ||
15 | +experiment = 'single' | ||
16 | +;iterations = 4 | ||
17 | +;medoids = range(2,6) | ||
18 | +iterations = 6 | ||
19 | +medoids = [100,500,1000,5000,10000,50000] | ||
20 | +;disabled for this experiment | ||
21 | +weight = 0 | ||
22 | +profile_size = 0 | ||
23 | +sample = 0 | ||
24 | + | ||
25 | +[collaborative] | |
26 | +users_repository=["data/popcon","data/popcon-100","data/popcon-500","data/popcon-1000","data/popcon-5000","data/popcon-10000","data/popcon-50000"] | ||
27 | +neighbors = range(10,1010,50) |
@@ -0,0 +1,171 @@ | @@ -0,0 +1,171 @@ | ||
1 | +#!/usr/bin/env python | ||
2 | +""" | ||
3 | + recommender suite - recommender experiments suite | ||
4 | +""" | ||
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | ||
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | ||
7 | +__license__ = """ | ||
8 | + This program is free software: you can redistribute it and/or modify | ||
9 | + it under the terms of the GNU General Public License as published by | ||
10 | + the Free Software Foundation, either version 3 of the License, or | ||
11 | + (at your option) any later version. | ||
12 | + | ||
13 | + This program is distributed in the hope that it will be useful, | ||
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | + GNU General Public License for more details. | ||
17 | + | ||
18 | + You should have received a copy of the GNU General Public License | ||
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 | +""" | ||
21 | + | ||
22 | +import expsuite | ||
23 | +import sys | ||
24 | +sys.path.insert(0,'../') | ||
25 | +from config import Config | ||
26 | +from data import PopconXapianIndex, PopconSubmission | ||
27 | +from recommender import Recommender | ||
28 | +from user import LocalSystem, User | ||
29 | +from evaluation import * | ||
30 | +import logging | ||
31 | +import random | ||
32 | +import Gnuplot | ||
33 | + | ||
class ClusteringSuite(expsuite.PyExperimentSuite):
    """Experiment suite that reclusters the sample popcon index once per
    configured medoid count and reports the resulting dispersion."""

    def reset(self, params, rep):
        """Create the configuration used by every iteration of a repetition.

        The configuration always points at the bundled test data; the index
        is only switched to "recluster" mode for the 'clustering' experiment.
        """
        config = Config()
        config.popcon_index = "../tests/test_data/.sample_pxi"
        config.popcon_dir = "../tests/test_data/popcon_dir"
        config.clusters_dir = "../tests/test_data/clusters_dir"
        self.cfg = config

        if params['name'] != "clustering":
            return
        logging.info("Starting 'clustering' experiments suite...")
        self.cfg.index_mode = "recluster"

    def iterate(self, params, rep, n):
        """Run iteration n: build the index with the n-th medoid count.

        Returns a dict with the medoid count and the measured dispersion, or
        an empty dict when the experiment is not 'clustering'.
        """
        if params['name'] != "clustering":
            return {}
        medoids = params['medoids'][n]
        logging.info("Running iteration %d" % medoids)
        self.cfg.k_medoids = medoids
        index = PopconXapianIndex(self.cfg)
        return {'k_medoids': medoids,
                'dispersion': index.cluster_dispersion}
55 | + | ||
class ContentBasedSuite(expsuite.PyExperimentSuite):
    """Content-based recommendation experiments.

    Each iteration hides a random part of the local system profile, asks the
    recommender for a full ranking based on the remaining packages and then
    measures how well the hidden packages are recovered.
    """

    # Position, within the per-size metric series, of the point reported in
    # the iteration log. Series are sampled every 100 ranking positions
    # starting at 1, so index 20 is the point for a recommendation of size 2001.
    SUMMARY_INDEX = 20

    def reset(self, params, rep):
        """Prepare recommender, user profile and sample size for a repetition.

        Only acts on experiments whose name starts with "content".
        """
        if params['name'].startswith("content"):
            cfg = Config()
            #if the index was not built yet
            #app_axi = AppAptXapianIndex(cfg.axi,"results/arnaldo/AppAxi")
            cfg.axi = "data/AppAxi"
            cfg.index_mode = "old"
            cfg.weight = params['weight']
            self.rec = Recommender(cfg)
            self.rec.set_strategy(params['strategy'])
            self.repo_size = self.rec.items_repository.get_doccount()
            self.user = LocalSystem()
            self.user.app_pkg_profile(self.rec.items_repository)
            self.user.no_auto_pkg_profile()
            self.sample_size = int(len(self.user.pkg_profile)*params['sample'])
            # iteration should be set to 10 in config file
            #self.profile_size = range(10,101,10)

    @staticmethod
    def _summary_point(series, index):
        """Return series[index], falling back to the last point of a shorter
        series and to None when the series is empty."""
        if not series:
            return None
        return series[min(index, len(series)-1)]

    def iterate(self, params, rep, n):
        """Run iteration n of a content-based experiment.

        Writes a recall log and a metrics plot under results/content/recall
        and returns the iteration summary as a dict. Experiments whose name
        does not start with "content" yield an empty dict.
        """
        if not params['name'].startswith("content"):
            return {}

        item_score = dict.fromkeys(self.user.pkg_profile,1)
        # Prepare partition: move sample_size random packages out of the
        # profile; they become the "real" items the recommender should find.
        sample = {}
        for i in range(self.sample_size):
            key = random.choice(list(item_score))
            sample[key] = item_score.pop(key)
        # Get full recommendation
        user = User(item_score)
        recommendation = self.rec.get_recommendation(user,self.repo_size)
        ranking = recommendation.ranking
        # First position of every package, computed once instead of calling
        # ranking.index() for each sampled package.
        position = {}
        for index, pkg in enumerate(ranking):
            position.setdefault(pkg, index)
        ranks = sorted(position[pkg] for pkg in sample if pkg in position)
        notfound = [pkg for pkg in sample if pkg not in position]
        # Write recall log
        recall_file = "results/content/recall/%s-%s-%.2f-%d" % \
                      (params['strategy'],params['weight'],params['sample'],n)
        with open(recall_file,'w') as output:
            output.write("# weight=%s\n" % params['weight'])
            output.write("# strategy=%s\n" % params['strategy'])
            output.write("# sample=%f\n" % params['sample'])
            output.write("\n%d %d %d\n" % \
                         (self.repo_size,len(item_score),self.sample_size))
            for r in ranks:
                output.write(str(r)+"\n")
            if notfound:
                output.write("Out of recommendation:\n")
                for pkg in notfound:
                    output.write(pkg+"\n")
        # Plot metrics summary
        accuracy = []
        precision = []
        recall = []
        f1 = []
        g = Gnuplot.Gnuplot()
        g('set style data lines')
        g.xlabel('Recommendation size')
        for size in range(1,len(ranking)+1,100):
            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repo_size)
            accuracy.append([size,evaluation.run(Accuracy())])
            precision.append([size,evaluation.run(Precision())])
            recall.append([size,evaluation.run(Recall())])
            f1.append([size,evaluation.run(F1())])
        g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
               Gnuplot.Data(precision,title="Precision"),
               Gnuplot.Data(recall,title="Recall"),
               Gnuplot.Data(f1,title="F1"))
        g.hardcopy(recall_file+"-plot.ps", enhanced=1, color=1)
        # Iteration log: each entry is a [size, value] pair. Rankings too
        # short to reach SUMMARY_INDEX report their last point instead of
        # failing with IndexError.
        point = self.SUMMARY_INDEX
        return {'iteration': n,
                'weight': params['weight'],
                'strategy': params['strategy'],
                'accuracy': self._summary_point(accuracy,point),
                'precision': self._summary_point(precision,point),
                'recall': self._summary_point(recall,point),
                'f1': self._summary_point(f1,point)}
139 | + | ||
140 | +#class CollaborativeSuite(expsuite.PyExperimentSuite): | ||
141 | +# def reset(self, params, rep): | ||
142 | +# if params['name'].startswith("collaborative"): | ||
143 | +# | ||
144 | +# def iterate(self, params, rep, n): | ||
145 | +# if params['name'].startswith("collaborative"): | ||
146 | +# for root, dirs, files in os.walk(self.source_dir): | ||
147 | +# for popcon_file in files: | ||
148 | +# submission = PopconSubmission(os.path.join(root,popcon_file)) | ||
149 | +# user = User(submission.packages) | ||
150 | +# user.maximal_pkg_profile() | ||
151 | +# rec.get_recommendation(user) | ||
152 | +# precision = 0 | ||
153 | +# result = {'weight': params['weight'], | ||
154 | +# 'strategy': params['strategy'], | ||
155 | +# 'profile_size': self.profile_size[n], | ||
156 | +# 'accuracy': accuracy, | ||
157 | +# 'precision': precision, | ||
158 | +# 'recall:': recall, | ||
159 | +# 'f1': } | ||
160 | +# else: | ||
161 | +# result = {} | ||
162 | +# return result | ||
163 | + | ||
if __name__ == '__main__':

    # With fewer than two command line arguments every suite is started;
    # otherwise only the suites named on the command line are.
    run_everything = len(sys.argv)<3
    if run_everything or "clustering" in sys.argv:
        ClusteringSuite().start()
    if run_everything or "content" in sys.argv:
        ContentBasedSuite().start()
    #if "collaborative" in sys.argv or len(sys.argv)<3:
        #CollaborativeSuite().start()
@@ -0,0 +1,49 @@ | @@ -0,0 +1,49 @@ | ||
1 | +#! /usr/bin/env python | ||
2 | +""" | ||
3 | + sample-popcon - extract a sample from popcon population | ||
4 | +""" | ||
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | ||
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | ||
7 | +__license__ = """ | ||
8 | + This program is free software: you can redistribute it and/or modify | ||
9 | + it under the terms of the GNU General Public License as published by | ||
10 | + the Free Software Foundation, either version 3 of the License, or | ||
11 | + (at your option) any later version. | ||
12 | + | ||
13 | + This program is distributed in the hope that it will be useful, | ||
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | + GNU General Public License for more details. | ||
17 | + | ||
18 | + You should have received a copy of the GNU General Public License | ||
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 | +""" | ||
21 | + | ||
22 | +import xapian | ||
23 | +import os | ||
24 | +import random | ||
25 | +import sys | ||
26 | + | ||
if __name__ == '__main__':
    # Move the submissions listed in sample_file (one submission id per line)
    # out of the popcon index into a new index named
    # "<popcon_index>-<sample file name>".
    try:
        sample_file = sys.argv[1]
        popcon = xapian.WritableDatabase(sys.argv[2],xapian.DB_OPEN)
    except Exception:
        # Missing arguments or an index that cannot be opened. Unlike a bare
        # "except:", this lets KeyboardInterrupt/SystemExit propagate.
        print("Usage: extract-sample-db sample_file popcon_index")
        sys.exit(1)
    enquire = xapian.Enquire(popcon)
    print(sample_file.split("/"))
    sample_name = os.path.basename(sample_file)
    new_popcon = xapian.WritableDatabase(sys.argv[2]+"-"+sample_name,
                                         xapian.DB_CREATE_OR_OVERWRITE)
    print("Popcon repository size: %d" % popcon.get_doccount())
    with open(sample_file) as submissions:
        for submission in submissions:
            # Submissions are looked up by the term "ID" + submission id
            term = "ID"+submission.strip()
            print(term)
            enquire.set_query(xapian.Query(term))
            mset = enquire.get_mset(0,20)
            for m in mset:
                print("Adding doc %s"%m.docid)
                new_popcon.add_document(popcon.get_document(m.docid))
                print("Removing doc %s"%m.docid)
                popcon.delete_document(m.docid)
    # First the size of the remaining population, then the size of the sample
    print("Popcon repository size: %d" % popcon.get_doccount())
    print("Popcon repository size: %d" % new_popcon.get_doccount())
@@ -0,0 +1,197 @@ | @@ -0,0 +1,197 @@ | ||
1 | +#!/usr/bin/env python | ||
2 | +""" | ||
3 | + hybrid-suite | ||
4 | +""" | ||
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | ||
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | ||
7 | +__license__ = """ | ||
8 | + This program is free software: you can redistribute it and/or modify | ||
9 | + it under the terms of the GNU General Public License as published by | ||
10 | + the Free Software Foundation, either version 3 of the License, or | ||
11 | + (at your option) any later version. | ||
12 | + | ||
13 | + This program is distributed in the hope that it will be useful, | ||
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | + GNU General Public License for more details. | ||
17 | + | ||
18 | + You should have received a copy of the GNU General Public License | ||
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 | +""" | ||
21 | + | ||
22 | +import sys | ||
23 | +sys.path.insert(0,'../') | ||
24 | +from config import Config | ||
25 | +from data import PopconXapianIndex, PopconSubmission | ||
26 | +from recommender import Recommender | ||
27 | +from user import LocalSystem, User | ||
28 | +from evaluation import * | ||
29 | +import logging | ||
30 | +import random | ||
31 | +import Gnuplot | ||
32 | +import numpy | ||
33 | + | ||
if __name__ == '__main__':
    # Evaluate a hybrid strategy for every (neighborhood size, profile size)
    # pair over a sample of popcon users: precision of the top 20 and F0.5 of
    # the top 100 recommendations, plus the coverage of both.

    # Imported explicitly: both names are used below but were not imported
    # by this script (presumably they leaked in through a star import).
    # NOTE(review): PopconSystem is assumed to live in the user module,
    # next to LocalSystem and User - confirm.
    import os
    from user import PopconSystem

    # Two arguments are read below, so argv needs at least three entries
    if len(sys.argv)<3:
        print("Usage: hybrid strategy sample_file")
        sys.exit(1)

    iterations = 20
    profile_size = [10,40,70,100,170,240]
    neighbor_size = [3,10,50,100,200,400]

    #hybrid_strategies = ['knnco','knnco_eset']

    # Reduced setup for quick runs:
    #iterations = 1
    #profile_size = [10,20,30]
    #neighbor_size = [10,20,30]

    def mean(values):
        """Average of values as a float; 0.0 when there are no values."""
        if not values:
            return 0.0
        return float(sum(values))/len(values)

    def plot_setup(k,threshold,scores,score_title,coverage,png_file):
        """Plot mean score and coverage against profile size for one k."""
        g = Gnuplot.Gnuplot()
        g('set style data lines')
        g('set yrange [0:1.0]')
        g.xlabel('Profile size')
        g.title("Setup: %s-neighboorhood%3d (threshold %d)" %
                (cfg.strategy,k,threshold))
        g.plot(Gnuplot.Data(sorted([[i,mean(scores[i])] for i in scores]),
                            title=score_title),
               Gnuplot.Data(sorted([[i,coverage[i]] for i in coverage]),
                            title="Coverage"))
        g.hardcopy(png_file,terminal="png")

    cfg = Config()
    population_sample = []
    strategy = sys.argv[1]
    sample_file = sys.argv[2]
    sample_str = sample_file.split('/')[-1]
    with open(sample_file,'r') as f:
        for line in f:
            user_id = line.strip('\n')
            if not user_id:
                # a blank line does not name a submission
                continue
            # submissions are stored under a directory named after the first
            # two characters of the user id
            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
    if not population_sample:
        # Without users there is no repository size to compute coverage with
        print("No users found in sample file %s" % sample_file)
        sys.exit(1)
    sample_dir = ("results/hybrid/%s" % sample_str)
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)

    cfg.strategy = strategy
    p_20_summary = {}
    f05_100_summary = {}
    c_20 = {}
    c_100 = {}

    log_file = os.path.join(sample_dir,sample_str+"-"+cfg.strategy)
    graph_20 = {}
    graph_100 = {}
    graph_20_jpg = {}
    graph_100_jpg = {}
    comment_20 = {}
    comment_100 = {}
    for k in neighbor_size:
        graph_20[k] = log_file+("-neighboorhod%.3d-020.png"%k)
        graph_100[k] = log_file+("-neighboorhod%.3d-100.png"%k)
        # splitext removes exactly the extension; str.strip(".png") removes
        # any of the characters '.', 'p', 'n', 'g' from both ends.
        graph_20_jpg[k] = os.path.splitext(graph_20[k])[0]+".jpg"
        graph_100_jpg[k] = os.path.splitext(graph_100[k])[0]+".jpg"
        comment_20[k] = graph_20_jpg[k]+".comment"
        comment_100[k] = graph_100_jpg[k]+".comment"

        # Headers of the per-neighborhood summary files
        for path, threshold, columns in ((comment_20[k],20,"p_20\tc_20"),
                                         (comment_100[k],100,"f05_100\tc_100")):
            with open(path,'w') as f:
                f.write("# %s\n" % sample_str)
                f.write("# strategy %s\n# threshold %d\n# iterations %d\n\n" %
                        (cfg.strategy,threshold,iterations))
                f.write("# neighboorhood\tprofile\t%s\n\n" % columns)

        c_20[k] = {}
        c_100[k] = {}
        p_20_summary[k] = {}
        f05_100_summary[k] = {}
        for size in profile_size:
            c_20[k][size] = set()
            c_100[k][size] = set()
            p_20_summary[k][size] = []
            f05_100_summary[k][size] = []
            with open(log_file+"-neighboorhood%.3d-profile%.3d"%(k,size),'w') as f:
                f.write("# %s\n" % sample_str)
                f.write("# strategy %s-neighboorhood%.3d-profile%.3d\n\n" % (cfg.strategy,k,size))
                f.write("# p_20\t\tf05_100\n\n")

    # main loop per user
    for submission_file in population_sample:
        user = PopconSystem(submission_file)
        user.filter_pkg_profile(cfg.pkgs_filter)
        user.maximal_pkg_profile()
        for k in neighbor_size:
            cfg.k_neighbors = k
            for size in profile_size:
                cfg.profile_size = size
                rec = Recommender(cfg)
                repo_size = rec.items_repository.get_doccount()
                p_20 = []
                f05_100 = []
                for n in range(iterations):
                    # Fill sample profile: 90% of the user profile, chosen at
                    # random, is hidden and used as the expected result
                    profile_len = len(user.pkg_profile)
                    item_score = {}
                    for pkg in user.pkg_profile:
                        item_score[pkg] = user.item_score[pkg]
                    sample = {}
                    sample_size = int(profile_len*0.9)
                    for i in range(sample_size):
                        key = random.choice(list(item_score))
                        sample[key] = item_score.pop(key)
                    iteration_user = User(item_score)
                    recommendation = rec.get_recommendation(iteration_user,repo_size)
                    if hasattr(recommendation,"ranking"):
                        ranking = recommendation.ranking
                        real = RecommendationResult(sample)
                        predicted_20 = RecommendationResult(dict.fromkeys(ranking[:20],1))
                        evaluation = Evaluation(predicted_20,real,repo_size)
                        p_20.append(evaluation.run(Precision()))
                        predicted_100 = RecommendationResult(dict.fromkeys(ranking[:100],1))
                        evaluation = Evaluation(predicted_100,real,repo_size)
                        f05_100.append(evaluation.run(F_score(0.5)))
                        c_20[k][size].update(ranking[:20])
                        c_100[k][size].update(ranking[:100])
                # save summary; both lists are filled together, so a user for
                # whom no iteration produced a ranking is skipped entirely
                # instead of dividing by zero when writing the log line
                if p_20 and f05_100:
                    p_20_mean = mean(p_20)
                    f05_100_mean = mean(f05_100)
                    p_20_summary[k][size].append(p_20_mean)
                    f05_100_summary[k][size].append(f05_100_mean)
                    with open(log_file+"-neighboorhood%.3d-profile%.3d"%(k,size),'a') as f:
                        f.write("%.4f\t\t%.4f\n" % (p_20_mean,f05_100_mean))

    # back to main flow
    coverage_20 = {}
    coverage_100 = {}
    for k in neighbor_size:
        coverage_20[k] = {}
        coverage_100[k] = {}
        with open(comment_20[k],'a') as f:
            for size in profile_size:
                coverage_20[k][size] = len(c_20[k][size])/float(repo_size)
                f.write("%3d\t\t%3d\t\t%.4f\t%.4f\n" %
                        (k,size,mean(p_20_summary[k][size]),coverage_20[k][size]))
        with open(comment_100[k],'a') as f:
            for size in profile_size:
                coverage_100[k][size] = len(c_100[k][size])/float(repo_size)
                f.write("%3d\t\t%3d\t\t%.4f\t%.4f\n" %
                        (k,size,mean(f05_100_summary[k][size]),coverage_100[k][size]))

    for k in neighbor_size:
        # plot results summary
        plot_setup(k,20,p_20_summary[k],"Precision",coverage_20[k],graph_20[k])
        #commands.getoutput("convert -quality 100 %s %s" %
        #                   (graph_20[k],graph_20_jpg[k]))
        plot_setup(k,100,f05_100_summary[k],"F05",coverage_100[k],graph_100[k])
        #commands.getoutput("convert -quality 100 %s %s" %
        #                   (graph_100[k],graph_100_jpg[k]))
@@ -0,0 +1,186 @@ | @@ -0,0 +1,186 @@ | ||
1 | +#!/usr/bin/env python | ||
2 | +""" | ||
3 | + k-suite - experiment different neighborhood sizes | ||
4 | +""" | ||
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | ||
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | ||
7 | +__license__ = """ | ||
8 | + This program is free software: you can redistribute it and/or modify | ||
9 | + it under the terms of the GNU General Public License as published by | ||
10 | + the Free Software Foundation, either version 3 of the License, or | ||
11 | + (at your option) any later version. | ||
12 | + | ||
13 | + This program is distributed in the hope that it will be useful, | ||
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | + GNU General Public License for more details. | ||
17 | + | ||
18 | + You should have received a copy of the GNU General Public License | ||
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 | +""" | ||
21 | + | ||
22 | +import sys | ||
23 | +sys.path.insert(0,'../') | ||
24 | +from config import Config | ||
25 | +from data import PopconXapianIndex, PopconSubmission | ||
26 | +from recommender import Recommender | ||
27 | +from user import LocalSystem, User | ||
28 | +from evaluation import * | ||
29 | +import logging | ||
30 | +import random | ||
31 | +import Gnuplot | ||
32 | +import numpy | ||
33 | + | ||
def plot_roc(k,roc_points,log_file):
    """Plot the ROC points gathered for neighborhood size k.

    The points are drawn together with the diagonal from [0,0] to [1,1] and
    saved as "<log_file>-kNNN.png" and "<log_file>-kNNN.ps".
    """
    setup_name = log_file.split("/")[-1]
    output_base = log_file+("-k%.3d"%k)
    diagonal = Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7")

    chart = Gnuplot.Gnuplot()
    chart('set style data points')
    chart.xlabel('False Positive Rate')
    chart.ylabel('True Positive Rate')
    chart('set xrange [0:1.0]')
    chart('set yrange [0:1.0]')
    chart.title("Setup: %s-k%d" % (setup_name,k))
    chart.plot(diagonal,Gnuplot.Data(roc_points))
    chart.hardcopy(output_base+".png",terminal="png")
    chart.hardcopy(output_base+".ps",terminal="postscript",enhanced=1,color=1)
46 | + | ||
def plot_summary(precision,f05,mcc,log_file):
    """Plot mean precision, F0.5 and MCC against the neighborhood size.

    precision, f05 and mcc map each neighborhood size k to the list of
    values measured for it. The plot is saved as "<log_file>.png" and
    "<log_file>.ps".
    """
    def mean_points(series):
        # One [k, mean] point per neighborhood size that has measurements.
        # Sorted by k: the data is drawn with lines, which connect points in
        # the order given, and dict order is arbitrary.
        return sorted([[k,float(sum(values))/len(values)]
                       for k, values in series.items() if values])

    g = Gnuplot.Gnuplot()
    g('set style data lines')
    g.xlabel('Neighborhood (k)')
    g.title("Setup: %s-size20" % (log_file.split("/")[-1]))
    g.plot(Gnuplot.Data(mean_points(precision),title="P"),
           Gnuplot.Data(mean_points(f05),title="F05"),
           Gnuplot.Data(mean_points(mcc),title="MCC"))
    g.hardcopy(log_file+(".png"),terminal="png")
    g.hardcopy(log_file+(".ps"),terminal="postscript",enhanced=1,color=1)
57 | + | ||
class ExperimentResults:
    """Collects the metrics of repeated recommendations made against a
    repository of a given size and summarizes them as averages."""

    def __init__(self,repo_size):
        """Start with no results for a repository of repo_size items."""
        self.repository_size = repo_size
        self.precision, self.recall, self.fpr = [], [], []
        self.f05, self.mcc = [], []

    def add_result(self,ranking,sample):
        """Evaluate one recommendation (ranking) against the expected items
        (sample) and store every metric of that evaluation."""
        evaluation = Evaluation(RecommendationResult(dict.fromkeys(ranking,1)),
                                RecommendationResult(sample),
                                self.repository_size)
        # Metrics are created lazily so each one is built right before it runs
        measures = ((self.precision, Precision),
                    (self.recall, Recall),
                    (self.fpr, FPR),
                    (self.f05, lambda: F_score(0.5)),
                    (self.mcc, MCC))
        for values, make_metric in measures:
            values.append(evaluation.run(make_metric()))

    def get_roc_point(self):
        """Return [mean false positive rate, mean true positive rate], or
        [0,0] while either of them has no values."""
        false_positive, true_positive = self.fpr, self.recall
        if true_positive and false_positive:
            return [sum(false_positive)/len(false_positive),
                    sum(true_positive)/len(true_positive)]
        return [0,0]

    def get_precision_summary(self):
        """Return the average precision, 0 when nothing was recorded."""
        values = self.precision
        if values:
            return sum(values)/len(values)
        return 0

    def get_f05_summary(self):
        """Return the average F0.5 score, 0 when nothing was recorded."""
        values = self.f05
        if values:
            return sum(values)/len(values)
        return 0

    def get_mcc_summary(self):
        """Return the average MCC, 0 when nothing was recorded."""
        values = self.mcc
        if values:
            return sum(values)/len(values)
        return 0
95 | + | ||
if __name__ == '__main__':
    # Evaluate one strategy for several neighborhood sizes (k) over a sample
    # of popcon users, logging ROC point, precision, F0.5 and MCC per user
    # and a coverage/metrics summary per k.

    # Imported explicitly: both names are used below but were not imported
    # by this script (presumably they leaked in through a star import).
    # NOTE(review): PopconSystem is assumed to live in the user module,
    # next to LocalSystem and User - confirm.
    import os
    from user import PopconSystem

    if len(sys.argv)<3:
        print("Usage: k-suite strategy_str sample_file")
        sys.exit(1)
    threshold = 20
    iterations = 30
    neighbors = [3,5,10,50,100,150,200,300,400,500]
    cfg = Config()
    cfg.strategy = sys.argv[1]
    sample_file = sys.argv[2]
    sample_str = sample_file.split('/')[-1]
    population_sample = []
    with open(sample_file,'r') as f:
        for line in f:
            user_id = line.strip('\n')
            if not user_id:
                # a blank line does not name a submission
                continue
            # submissions are stored under a directory named after the first
            # two characters of the user id
            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
    if not population_sample:
        # Without users there is nothing to average and no repository size
        print("No users found in sample file %s" % sample_file)
        sys.exit(1)
    # setup dictionaries and files
    roc_summary = {}
    recommended = {}
    precision_summary = {}
    f05_summary = {}
    mcc_summary = {}
    sample_dir = ("results/k-suite/%s" % sample_str)
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)
    log_file = os.path.join(sample_dir,cfg.strategy)
    with open(log_file,'w') as f:
        f.write("# %s\n\n" % sample_str)
        f.write("# strategy %s recommendation_size %d iterations %d\n\n" %
                (cfg.strategy,threshold,iterations))
        f.write("# k coverage \tprecision \tf05 \tmcc\n\n")

    for k in neighbors:
        roc_summary[k] = []
        recommended[k] = set()
        precision_summary[k] = []
        f05_summary[k] = []
        mcc_summary[k] = []
        with open(log_file+"-k%.3d"%k,'w') as f:
            f.write("# %s\n\n" % sample_str)
            f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))
            f.write("# roc_point \tprecision \tf05 \tmcc\n\n")

    # main loop per user
    for submission_file in population_sample:
        user = PopconSystem(submission_file)
        user.filter_pkg_profile(cfg.pkgs_filter)
        user.maximal_pkg_profile()
        for k in neighbors:
            cfg.k_neighbors = k
            rec = Recommender(cfg)
            repo_size = rec.items_repository.get_doccount()
            results = ExperimentResults(repo_size)
            # n iterations for same recommender and user
            for n in range(iterations):
                # Fill sample profile: 90% of the user profile, chosen at
                # random, is hidden and used as the expected result
                profile_len = len(user.pkg_profile)
                item_score = {}
                for pkg in user.pkg_profile:
                    item_score[pkg] = user.item_score[pkg]
                sample = {}
                sample_size = int(profile_len*0.9)
                for i in range(sample_size):
                    key = random.choice(list(item_score))
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,threshold)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
                    recommended[k].update(recommendation.ranking)
            # save summary
            roc_point = results.get_roc_point()
            roc_summary[k].append(roc_point)
            precision = results.get_precision_summary()
            precision_summary[k].append(precision)
            f05 = results.get_f05_summary()
            f05_summary[k].append(f05)
            mcc = results.get_mcc_summary()
            mcc_summary[k].append(mcc)
            with open(log_file+"-k%.3d"%k,'a') as f:
                f.write("[%.2f,%.2f] \t%.4f \t%.4f \t%.4f\n" %
                        (roc_point[0],roc_point[1],precision,f05,mcc))
    # back to main flow
    plot_summary(precision_summary,f05_summary,mcc_summary,log_file)
    with open(log_file,'a') as f:
        for k in neighbors:
            # Coverage: share of the repository recommended at least once
            # for this k (previously indexed with the undefined name "size")
            coverage = len(recommended[k])/float(repo_size)
            f.write("%3d \t%.2f \t%.4f \t%.4f \t%.4f\n" %
                    (k,coverage,float(sum(precision_summary[k]))/len(precision_summary[k]),
                     float(sum(f05_summary[k]))/len(f05_summary[k]),
                     float(sum(mcc_summary[k]))/len(mcc_summary[k])))
            plot_roc(k,roc_summary[k],log_file)
@@ -0,0 +1,51 @@ | @@ -0,0 +1,51 @@ | ||
1 | +#!/usr/bin/env python | ||
2 | +""" | ||
3 | + recommender suite - recommender experiments suite | ||
4 | +""" | ||
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | ||
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | ||
7 | +__license__ = """ | ||
8 | + This program is free software: you can redistribute it and/or modify | ||
9 | + it under the terms of the GNU General Public License as published by | ||
10 | + the Free Software Foundation, either version 3 of the License, or | ||
11 | + (at your option) any later version. | ||
12 | + | ||
13 | + This program is distributed in the hope that it will be useful, | ||
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | + GNU General Public License for more details. | ||
17 | + | ||
18 | + You should have received a copy of the GNU General Public License | ||
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 | +""" | ||
21 | + | ||
22 | +import sys | ||
23 | +import os | ||
24 | +sys.path.insert(0,'../') | ||
25 | +from config import Config | ||
26 | +from data import PopconXapianIndex, PopconSubmission | ||
27 | +from recommender import Recommender | ||
28 | +from user import LocalSystem, User | ||
29 | +from evaluation import * | ||
30 | +import logging | ||
31 | +import random | ||
32 | +import Gnuplot | ||
33 | + | ||
if __name__ == '__main__':
    # Rebuild the popcon index from scratch ("recluster" mode) and record the
    # resulting cluster dispersion for the configured medoid/popcon limits.
    cfg = Config()
    cfg.index_mode = "recluster"
    logging.info("Starting clustering experiments")
    logging.info("Medoids: %d\t Max popcon:%d" % (cfg.k_medoids,cfg.max_popcon))
    cfg.popcon_dir = os.path.expanduser("~/org/popcon.debian.org/popcon-mail/popcon-entries/")
    # Index and clusters locations are suffixed with the experiment setup so
    # that runs with different parameters do not overwrite each other.
    setup_suffix = "_%dmedoids%dmax" % (cfg.k_medoids,cfg.max_popcon)
    cfg.popcon_index = cfg.popcon_index+setup_suffix
    cfg.clusters_dir = cfg.clusters_dir+setup_suffix
    pxi = PopconXapianIndex(cfg)
    logging.info("Overall dispersion: %f\n" % pxi.cluster_dispersion)
    # Write clustering log
    results_dir = "results/clustering"
    if not os.path.isdir(results_dir):
        os.makedirs(results_dir)
    log_path = "%s/%dmedoids%dmax" % (results_dir,cfg.k_medoids,cfg.max_popcon)
    with open(log_path,'w') as output:
        output.write("# k_medoids\tmax_popcon\tdispersion\n")
        # Three values need three conversions: the former "%d %f" format
        # raised TypeError because max_popcon had no placeholder.
        output.write("%d %d %f\n" %
                     (cfg.k_medoids,cfg.max_popcon,pxi.cluster_dispersion))
@@ -0,0 +1,27 @@ | @@ -0,0 +1,27 @@ | ||
1 | +[DEFAULT] | ||
2 | +repetitions = 1 | ||
3 | +iterations = 10 | ||
4 | +path = 'results' | ||
5 | +experiment = 'grid' | ||
6 | +weight = ['bm25', 'trad'] | ||
7 | +;profile_size = range(10,100,10) | ||
8 | +;sample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] | ||
9 | +sample = [0.6, 0.7, 0.8, 0.9] | ||
10 | + | ||
11 | +[content] | ||
12 | +strategy = ['cb','cbt','cbd'] | ||
13 | + | ||
14 | +[clustering] | ||
15 | +experiment = 'single' | ||
16 | +;iterations = 4 | ||
17 | +;medoids = range(2,6) | ||
18 | +iterations = 6 | ||
19 | +medoids = [100,500,1000,5000,10000,50000] | ||
20 | +;disabled for this experiment | ||
21 | +weight = 0 | ||
22 | +profile_size = 0 | ||
23 | +sample = 0 | ||
24 | + | ||
25 | +[collaborative] | |
26 | +users_repository=["data/popcon","data/popcon-100","data/popcon-500","data/popcon-1000","data/popcon-5000","data/popcon-10000","data/popcon-50000"] | ||
27 | +neighbors = range(10,1010,50) |
@@ -0,0 +1,171 @@ | @@ -0,0 +1,171 @@ | ||
1 | +#!/usr/bin/env python | ||
2 | +""" | ||
3 | + recommender suite - recommender experiments suite | ||
4 | +""" | ||
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | ||
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | ||
7 | +__license__ = """ | ||
8 | + This program is free software: you can redistribute it and/or modify | ||
9 | + it under the terms of the GNU General Public License as published by | ||
10 | + the Free Software Foundation, either version 3 of the License, or | ||
11 | + (at your option) any later version. | ||
12 | + | ||
13 | + This program is distributed in the hope that it will be useful, | ||
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | + GNU General Public License for more details. | ||
17 | + | ||
18 | + You should have received a copy of the GNU General Public License | ||
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 | +""" | ||
21 | + | ||
22 | +import expsuite | ||
23 | +import sys | ||
24 | +sys.path.insert(0,'../') | ||
25 | +from config import Config | ||
26 | +from data import PopconXapianIndex, PopconSubmission | ||
27 | +from recommender import Recommender | ||
28 | +from user import LocalSystem, User | ||
29 | +from evaluation import * | ||
30 | +import logging | ||
31 | +import random | ||
32 | +import Gnuplot | ||
33 | + | ||
class ClusteringSuite(expsuite.PyExperimentSuite):
    """Experiment suite logging the cluster dispersion per medoids setting."""

    def reset(self, params, rep):
        """Create the configuration used by the following iterations."""
        config = Config()
        config.popcon_index = "../tests/test_data/.sample_pxi"
        config.popcon_dir = "../tests/test_data/popcon_dir"
        config.clusters_dir = "../tests/test_data/clusters_dir"
        self.cfg = config

        if params['name'] != "clustering":
            return
        logging.info("Starting 'clustering' experiments suite...")
        self.cfg.index_mode = "recluster"

    def iterate(self, params, rep, n):
        """Index with the n-th medoids value and return its dispersion.

        Returns an empty dict for experiments not named "clustering".
        """
        if params['name'] != "clustering":
            return {}
        k_medoids = params['medoids'][n]
        logging.info("Running iteration %d" % k_medoids)
        self.cfg.k_medoids = k_medoids
        index = PopconXapianIndex(self.cfg)
        return {'k_medoids': k_medoids,
                'dispersion': index.cluster_dispersion}
55 | + | ||
class ContentBasedSuite(expsuite.PyExperimentSuite):
    """Experiment suite for the content-based strategies.

    Each iteration holds out part of the user's package profile, asks the
    recommender for a full ranking based on the remaining packages, logs
    the ranks of the held-out packages and plots accuracy, precision,
    recall and F1 against the recommendation size.
    """

    def reset(self, params, rep):
        """Set up recommender, user profile and sample size for a run."""
        if params['name'].startswith("content"):
            cfg = Config()
            #if the index was not built yet
            #app_axi = AppAptXapianIndex(cfg.axi,"results/arnaldo/AppAxi")
            cfg.axi = "data/AppAxi"
            cfg.index_mode = "old"
            cfg.weight = params['weight']
            self.rec = Recommender(cfg)
            self.rec.set_strategy(params['strategy'])
            self.repo_size = self.rec.items_repository.get_doccount()
            self.user = LocalSystem()
            self.user.app_pkg_profile(self.rec.items_repository)
            self.user.no_auto_pkg_profile()
            self.sample_size = int(len(self.user.pkg_profile)*params['sample'])
            # iteration should be set to 10 in config file
            #self.profile_size = range(10,101,10)

    @staticmethod
    def _summary_point(points, index=20):
        """Return points[index], or the last point if the curve is shorter.

        Curve points are collected every 100 ranking positions, so short
        rankings yield fewer than index+1 points; None is returned when
        there are no points at all.
        """
        if not points:
            return None
        return points[min(index,len(points)-1)]

    def _write_recall_log(self, recall_file, params, ranking, sample, profile_len):
        """Write the sorted ranks of the held-out packages to recall_file."""
        # First position of every package, so lookups are not O(len(ranking))
        position = {}
        for rank,pkg in enumerate(ranking):
            position.setdefault(pkg,rank)
        ranks = []
        notfound = []
        for pkg in sample.keys():
            if pkg in position:
                ranks.append(position[pkg])
            else:
                notfound.append(pkg)
        with open(recall_file,'w') as output:
            output.write("# weight=%s\n" % params['weight'])
            output.write("# strategy=%s\n" % params['strategy'])
            output.write("# sample=%f\n" % params['sample'])
            output.write("\n%d %d %d\n" % \
                         (self.repo_size,profile_len,self.sample_size))
            for r in sorted(ranks):
                output.write(str(r)+"\n")
            if notfound:
                output.write("Out of recommendation:\n")
                for pkg in notfound:
                    output.write(pkg+"\n")

    def iterate(self, params, rep, n):
        """Run iteration n and return its metrics summary.

        Returns an empty dict for experiments whose name does not start
        with "content" (previously this path raised UnboundLocalError).
        """
        if not params['name'].startswith("content"):
            return {}
        item_score = dict.fromkeys(self.user.pkg_profile,1)
        # Prepare partition: move sample_size random packages out of the
        # profile into the held-out sample
        sample = {}
        for i in range(self.sample_size):
            key = random.choice(list(item_score))
            sample[key] = item_score.pop(key)
        # Get full recommendation
        user = User(item_score)
        recommendation = self.rec.get_recommendation(user,self.repo_size)
        # Write recall log
        recall_file = "results/content/recall/%s-%s-%.2f-%d" % \
                      (params['strategy'],params['weight'],params['sample'],n)
        self._write_recall_log(recall_file,params,recommendation.ranking,
                               sample,len(item_score))
        # Plot metrics summary
        accuracy = []
        precision = []
        recall = []
        f1 = []
        g = Gnuplot.Gnuplot()
        g('set style data lines')
        g.xlabel('Recommendation size')
        for size in range(1,len(recommendation.ranking)+1,100):
            predicted = RecommendationResult(dict.fromkeys(recommendation.ranking[:size],1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repo_size)
            accuracy.append([size,evaluation.run(Accuracy())])
            precision.append([size,evaluation.run(Precision())])
            recall.append([size,evaluation.run(Recall())])
            f1.append([size,evaluation.run(F1())])
        g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
               Gnuplot.Data(precision,title="Precision"),
               Gnuplot.Data(recall,title="Recall"),
               Gnuplot.Data(f1,title="F1"))
        g.hardcopy(recall_file+"-plot.ps", enhanced=1, color=1)
        # Iteration log ('recall' key used to carry a stray colon)
        result = {'iteration': n,
                  'weight': params['weight'],
                  'strategy': params['strategy'],
                  'accuracy': self._summary_point(accuracy),
                  'precision': self._summary_point(precision),
                  'recall': self._summary_point(recall),
                  'f1': self._summary_point(f1)}
        return result
139 | + | ||
140 | +#class CollaborativeSuite(expsuite.PyExperimentSuite): | ||
141 | +# def reset(self, params, rep): | ||
142 | +# if params['name'].startswith("collaborative"): | ||
143 | +# | ||
144 | +# def iterate(self, params, rep, n): | ||
145 | +# if params['name'].startswith("collaborative"): | ||
146 | +# for root, dirs, files in os.walk(self.source_dir): | ||
147 | +# for popcon_file in files: | ||
148 | +# submission = PopconSubmission(os.path.join(root,popcon_file)) | ||
149 | +# user = User(submission.packages) | ||
150 | +# user.maximal_pkg_profile() | ||
151 | +# rec.get_recommendation(user) | ||
152 | +# precision = 0 | ||
153 | +# result = {'weight': params['weight'], | ||
154 | +# 'strategy': params['strategy'], | ||
155 | +# 'profile_size': self.profile_size[n], | ||
156 | +# 'accuracy': accuracy, | ||
157 | +# 'precision': precision, | ||
158 | +# 'recall:': recall, | ||
159 | +# 'f1': } | ||
160 | +# else: | ||
161 | +# result = {} | ||
162 | +# return result | ||
163 | + | ||
if __name__ == '__main__':
    # Every suite runs when fewer than two arguments are given; otherwise
    # only the suites named on the command line.
    run_all = len(sys.argv)<3

    if run_all or "clustering" in sys.argv:
        ClusteringSuite().start()
    if run_all or "content" in sys.argv:
        ContentBasedSuite().start()
    #if "collaborative" in sys.argv or len(sys.argv)<3:
        #CollaborativeSuite().start()
@@ -0,0 +1,74 @@ | @@ -0,0 +1,74 @@ | ||
1 | +#! /usr/bin/env python | ||
2 | +""" | ||
3 | + misc_popcon - misc experiments with popcon data | ||
4 | +""" | ||
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | ||
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | ||
7 | +__license__ = """ | ||
8 | + This program is free software: you can redistribute it and/or modify | ||
9 | + it under the terms of the GNU General Public License as published by | ||
10 | + the Free Software Foundation, either version 3 of the License, or | ||
11 | + (at your option) any later version. | ||
12 | + | ||
13 | + This program is distributed in the hope that it will be useful, | ||
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | + GNU General Public License for more details. | ||
17 | + | ||
18 | + You should have received a copy of the GNU General Public License | ||
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 | +""" | ||
21 | + | ||
22 | +import Gnuplot | ||
23 | +import xapian | ||
24 | +import os | ||
25 | +import random | ||
26 | +import sys | ||
27 | + | ||
def get_population_profile(popcon):
    """Count how many users have each profile size.

    `popcon` must provide get_doccount() and get_document(n); documents
    are read for ids 1..get_doccount() inclusive (the previous upper bound
    skipped the last document). A user's profile size is the number of
    terms starting with "XP". Users with fewer than 10 such terms are
    reported on stdout.

    Returns (population_profile, max_profile), where population_profile is
    a list of (profile_size, user_count) tuples sorted by profile size and
    max_profile is the largest profile size found; ([], 0) for an empty
    index.
    """
    size_count = {}
    for n in range(1,popcon.get_doccount()+1):
        user = popcon.get_document(n)
        profile_len = len([t for t in user.termlist()
                           if t.term.startswith("XP")])
        if profile_len<10:
            print("-- profile<10: %s" % user.get_data())
        # Tally directly instead of calling list.count() per size later
        size_count[profile_len] = size_count.get(profile_len,0)+1
    if not size_count:
        return [],0
    max_profile = max(size_count)
    population_profile = sorted(size_count.items())
    return population_profile,max_profile
41 | + | ||
def get_profile_ranges(population_profile,max_profile,popcon_size):
    """Aggregate the population into profile-size ranges.

    The ranges are (0,50], (50,100], ..., (200,250] and (250,max_profile].
    `population_profile` is a list of (profile_size, user_count) tuples.

    Returns (ranges_population, ranges_percentage): two lists with one
    (range_upper_bound, value) tuple per range, holding respectively the
    number of users in the range and that number divided by popcon_size.
    """
    bounds = list(range(0,251,50))
    bounds.append(max_profile)
    ranges_population = []
    ranges_percentage = []
    # Pair each bound with its predecessor by position; looking the lower
    # bound up with list.index() picked the wrong one whenever max_profile
    # was equal to one of the fixed bounds.
    for minimum,maximum in zip(bounds,bounds[1:]):
        population = sum(count for size,count in population_profile
                         if minimum<size<=maximum)
        ranges_population.append((maximum,population))
        ranges_percentage.append((maximum,population/float(popcon_size)))
    return ranges_population,ranges_percentage
54 | + | ||
def plot(data,xlabel,ylabel,output):
    """Plot `data` as points and save it to <output>.png and <output>.ps."""
    png_file = output+".png"
    ps_file = output+".ps"
    chart = Gnuplot.Gnuplot()
    chart('set style data points')
    chart.xlabel(xlabel)
    chart.ylabel(ylabel)
    chart.plot(data)
    chart.hardcopy(png_file, terminal="png")
    chart.hardcopy(ps_file, terminal="postscript", enhanced=1, color=1)
63 | + | ||
if __name__ == '__main__':
    # Report and plot how the popcon population is distributed over
    # desktop profile sizes.
    index_path = os.path.expanduser("~/.app-recommender/popcon_desktopapps")
    popcon = xapian.Database(index_path)
    print("Popcon repository size: %d" % popcon.get_doccount())

    profile_population,max_profile = get_population_profile(popcon)
    ranges = get_profile_ranges(profile_population,max_profile,
                                popcon.get_doccount())
    ranges_population,ranges_percentage = ranges
    print("Population per profile range (up to index)")
    print(ranges_population)
    plot(profile_population,"Desktop profile size","Population size",
         "results/misc-popcon/profile_population")
@@ -0,0 +1,199 @@ | @@ -0,0 +1,199 @@ | ||
1 | +#!/usr/bin/env python | ||
2 | +""" | ||
3 | + profile-suite - experiment different profile sizes | ||
4 | +""" | ||
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | ||
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | ||
7 | +__license__ = """ | ||
8 | + This program is free software: you can redistribute it and/or modify | ||
9 | + it under the terms of the GNU General Public License as published by | ||
10 | + the Free Software Foundation, either version 3 of the License, or | ||
11 | + (at your option) any later version. | ||
12 | + | ||
13 | + This program is distributed in the hope that it will be useful, | ||
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | + GNU General Public License for more details. | ||
17 | + | ||
18 | + You should have received a copy of the GNU General Public License | ||
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 | +""" | ||
21 | + | ||
22 | +import sys | ||
23 | +sys.path.insert(0,'../') | ||
24 | +from config import Config | ||
25 | +from data import PopconXapianIndex, PopconSubmission | ||
26 | +from recommender import Recommender | ||
27 | +from user import LocalSystem, User | ||
28 | +from evaluation import * | ||
29 | +import logging | ||
30 | +import random | ||
31 | +import Gnuplot | ||
32 | +import numpy | ||
33 | + | ||
if __name__ == '__main__':
    # Imported explicitly so the script does not depend on these modules
    # leaking in through "from evaluation import *".
    import commands
    import os

    usage = "Usage: profile-suite strategy_category sample_file"
    # Both arguments are required: sys.argv[2] is read below
    if len(sys.argv)<3:
        print(usage)
        sys.exit(1)

    iterations = 20
    profile_size = [10,20,40,70,100,140,170,200,240]
    neighbor_size = [3,5,10,50,100,150,200,300,400,500]

    content_strategies = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
    collaborative_strategies = ['knn_eset']#,'knn_eset','knn_plus']
    #collaborative_strategies = ['knn','knn_eset','knn_plus']

    #iterations = 1
    #profile_size = [10,20,30]
    #neighbor_size = [10,20,30]
    #content_strategies = ['cb']
    #collaborative_strategies = ['knn_eset']

    strategy_category = sys.argv[1]
    if strategy_category == "content":
        strategies = content_strategies
        sizes = profile_size
        option_str = "profile"
    elif strategy_category == "collaborative":
        strategies = collaborative_strategies
        sizes = neighbor_size
        option_str = "neighborhood"
    else:
        print(usage)
        sys.exit(1)

    def mean(values):
        """Average of `values`; 0.0 when there is nothing to average."""
        if not values:
            return 0.0
        return float(sum(values))/len(values)

    cfg = Config()
    # The sample file lists one popcon user id per line; submissions are
    # looked up under <popcon_dir>/<first two chars of id>/<id>
    population_sample = []
    sample_file = sys.argv[2]
    sample_str = sample_file.split('/')[-1]
    with open(sample_file,'r') as f:
        for line in f:
            user_id = line.strip('\n')
            if not user_id:
                continue
            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
    if not population_sample:
        print("Empty sample file: %s" % sample_file)
        sys.exit(1)
    sample_dir = ("results/%s/%s" %
                  (strategy_category,sample_str))
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)

    for strategy in strategies:
        cfg.strategy = strategy
        p_20_summary = {}
        f05_100_summary = {}
        c_20 = {}
        c_100 = {}

        log_file = os.path.join(sample_dir,sample_str+"-"+cfg.strategy)
        graph_20 = log_file+"-20.png"
        graph_100 = log_file+"-100.png"
        # Built from the common prefix: str.strip(".png") removes any of
        # the characters '.', 'p', 'n', 'g' from both ends, not a suffix
        graph_20_jpg = log_file+"-20.jpg"
        graph_100_jpg = log_file+"-100.jpg"
        comment_20 = graph_20_jpg+".comment"
        comment_100 = graph_100_jpg+".comment"

        with open(comment_20,'w') as f:
            f.write("# sample %s\n" % sample_str)
            f.write("# strategy %s\n# threshold 20\n# iterations %d\n\n" %
                    (cfg.strategy,iterations))
            f.write("# %s\tp_20\tc_20\n\n"%option_str)
        with open(comment_100,'w') as f:
            f.write("# sample %s\n" % sample_str)
            f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
                    (cfg.strategy,iterations))
            f.write("# %s\t\tf05_100\t\tc_100\n\n"%option_str)

        for size in sizes:
            c_20[size] = set()
            c_100[size] = set()
            p_20_summary[size] = []
            f05_100_summary[size] = []
            with open(log_file+"-%s%.3d"%(option_str,size),'w') as f:
                f.write("# sample %s\n" % sample_str)
                f.write("# strategy %s-%s%.3d\n\n" % (cfg.strategy,option_str,size))
                f.write("# p_20\tf05_100\n\n")

        # main loop per user
        for submission_file in population_sample:
            user = PopconSystem(submission_file)
            user.filter_pkg_profile(cfg.pkgs_filter)
            user.maximal_pkg_profile()
            for size in sizes:
                cfg.profile_size = size
                cfg.k_neighbors = size
                rec = Recommender(cfg)
                repo_size = rec.items_repository.get_doccount()
                p_20 = []
                f05_100 = []
                for n in range(iterations):
                    # Fill sample profile: 90% of the user's packages are
                    # held out, the rest feeds the recommender
                    profile_len = len(user.pkg_profile)
                    item_score = {}
                    for pkg in user.pkg_profile:
                        item_score[pkg] = user.item_score[pkg]
                    sample = {}
                    sample_size = int(profile_len*0.9)
                    for i in range(sample_size):
                        key = random.choice(list(item_score))
                        sample[key] = item_score.pop(key)
                    iteration_user = User(item_score)
                    recommendation = rec.get_recommendation(iteration_user,repo_size)
                    if hasattr(recommendation,"ranking"):
                        ranking = recommendation.ranking
                        real = RecommendationResult(sample)
                        predicted_20 = RecommendationResult(dict.fromkeys(ranking[:20],1))
                        evaluation = Evaluation(predicted_20,real,repo_size)
                        p_20.append(evaluation.run(Precision()))
                        predicted_100 = RecommendationResult(dict.fromkeys(ranking[:100],1))
                        evaluation = Evaluation(predicted_100,real,repo_size)
                        f05_100.append(evaluation.run(F_score(0.5)))
                        c_20[size] = c_20[size].union(ranking[:20])
                        c_100[size] = c_100[size].union(ranking[:100])
                # save summary
                if p_20:
                    p_20_summary[size].append(mean(p_20))
                if f05_100:
                    f05_100_summary[size].append(mean(f05_100))
                # Nothing to log for this user when no iteration produced
                # a ranking (the averages would divide by zero)
                if p_20 and f05_100:
                    with open(log_file+"-%s%.3d"%(option_str,size),'a') as f:
                        f.write("%.4f \t%.4f\n" %
                                (mean(p_20),mean(f05_100)))

        # back to main flow
        coverage_20 = {}
        coverage_100 = {}
        with open(comment_20,'a') as f:
            for size in sizes:
                coverage_20[size] = len(c_20[size])/float(repo_size)
                f.write("%3d\t\t%.4f\t\t%.4f\n" %
                        (size,mean(p_20_summary[size]),coverage_20[size]))
        with open(comment_100,'a') as f:
            for size in sizes:
                coverage_100[size] = len(c_100[size])/float(repo_size)
                f.write("%3d\t\t%.4f\t\t%.4f\n" %
                        (size,mean(f05_100_summary[size]),coverage_100[size]))

        # plot results summary
        g = Gnuplot.Gnuplot()
        g('set style data lines')
        g('set yrange [0:1.0]')
        g.xlabel('%s size'%option_str.capitalize())
        g.title("Setup: %s (threshold 20)" % cfg.strategy)
        g.plot(Gnuplot.Data(sorted([[k,mean(p_20_summary[k])]
                                    for k in p_20_summary.keys()]),title="Precision"),
               Gnuplot.Data(sorted([[k,coverage_20[k]]
                                    for k in coverage_20.keys()]),title="Coverage"))
        g.hardcopy(graph_20,terminal="png")
        # Convert the threshold-20 graph (the source used to be graph_100,
        # which is only generated further below)
        commands.getoutput("convert -quality 20 %s %s" %
                           (graph_20,graph_20_jpg))
        g = Gnuplot.Gnuplot()
        g('set style data lines')
        g('set yrange [0:1.0]')
        g.xlabel('%s size'%option_str.capitalize())
        g.title("Setup: %s (threshold 100)" % cfg.strategy)
        g.plot(Gnuplot.Data(sorted([[k,mean(f05_100_summary[k])]
                                    for k in f05_100_summary.keys()]),title="F05"),
               Gnuplot.Data(sorted([[k,coverage_100[k]]
                                    for k in coverage_100.keys()]),title="Coverage"))
        g.hardcopy(graph_100,terminal="png")
        commands.getoutput("convert -quality 100 %s %s" %
                           (graph_100,graph_100_jpg))
@@ -0,0 +1,231 @@ | @@ -0,0 +1,231 @@ | ||
1 | +#!/usr/bin/env python | ||
2 | +""" | ||
3 | + recommender suite - recommender experiments suite | ||
4 | +""" | ||
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | ||
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | ||
7 | +__license__ = """ | ||
8 | + This program is free software: you can redistribute it and/or modify | ||
9 | + it under the terms of the GNU General Public License as published by | ||
10 | + the Free Software Foundation, either version 3 of the License, or | ||
11 | + (at your option) any later version. | ||
12 | + | ||
13 | + This program is distributed in the hope that it will be useful, | ||
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | + GNU General Public License for more details. | ||
17 | + | ||
18 | + You should have received a copy of the GNU General Public License | ||
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 | +""" | ||
21 | + | ||
22 | +import sys | ||
23 | +sys.path.insert(0,'../') | ||
24 | +from config import Config | ||
25 | +from data import PopconXapianIndex, PopconSubmission | ||
26 | +from recommender import Recommender | ||
27 | +from user import LocalSystem, User | ||
28 | +from evaluation import * | ||
29 | +import logging | ||
30 | +import random | ||
31 | +import Gnuplot | ||
32 | +import numpy | ||
33 | + | ||
34 | +#iterations = 3 | ||
35 | +#sample_proportions = [0.9] | ||
36 | +#weighting = [('bm25',1.2)] | ||
37 | +#collaborative = ['knn_eset'] | ||
38 | +#content_based = ['cb'] | ||
39 | +#hybrid = ['knnco'] | ||
40 | +#profile_size = [50,100] | ||
41 | +#popcon_size = ["1000"] | ||
42 | +#neighbors = [50] | ||
43 | + | ||
# Experiment settings shared by the functions below.

# number of sampling rounds per setup
iterations = 30
# fraction of the user profile held out as the test sample
sample_proportions = [0.9]
# (weighting scheme, bm25 k1 parameter) pairs
weighting = [('bm25',1.0)]

# strategies per category
content_based = [
    'cb','cbt','cbd','cbh',
    'cb_eset','cbt_eset','cbd_eset','cbh_eset',
]
collaborative = ['knn_eset','knn','knn_plus']
hybrid = ['knnco','knnco_eset']

# parameter grids
profile_size = range(20,200,40)
neighbors = range(10,510,50)
52 | + | ||
def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
    """Write the recall log of iteration `n` to the file "<log_file>-<nn>".

    The log starts with the label description/values and a line holding
    repo_size, profile_size and len(sample). If `recommendation` has a
    `ranking` attribute, the (0-based) ranking positions of the sample
    packages follow in ascending order, then the sample packages missing
    from the ranking.

    label          -- dict with the keys "description" and "values"
    sample         -- dict whose keys are the held-out packages
    recommendation -- object optionally exposing a `ranking` list
    """
    # Write recall log; "with" closes the file even if a write fails
    with open(("%s-%.2d" % (log_file,n)),'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%.2d\n" % (label["values"],n))
        output.write("\n# repository profile sample\n%d %d %d\n" % \
                     (repo_size,profile_size,len(sample)))
        if not hasattr(recommendation,"ranking"):
            return
        # First position of each package, computed once instead of an
        # "in" + index() scan of the ranking per sample package
        position = {}
        for rank,pkg in enumerate(recommendation.ranking):
            position.setdefault(pkg,rank)
        notfound = []
        ranks = []
        for pkg in sample.keys():
            if pkg in position:
                ranks.append(position[pkg])
            else:
                notfound.append(pkg)
        for r in sorted(ranks):
            output.write(str(r)+"\n")
        if notfound:
            output.write("# out of recommendation:\n")
            for pkg in notfound:
                output.write(pkg+"\n")
75 | + | ||
def plot_roc(roc_points,eauc,c,p,log_file):
    """Plot the ROC curve to <log_file>-roc.png and <log_file>-roc.ps.

    The graph is annotated with the values of `c` (label "C"), `p`
    (label "P(20)") and `eauc` (label "AUC"), and includes the diagonal
    from (0,0) to (1,1).
    """
    setup_name = log_file.split("/")[-1]
    chart = Gnuplot.Gnuplot()
    chart('set style data lines')
    chart.xlabel('False Positive Rate')
    chart.ylabel('True Positive Rate')
    for axis in ('xrange','yrange'):
        chart('set %s [0:1.0]' % axis)
    chart.title("Setup: %s" % setup_name)
    annotations = (('set label "C %.2f" at 0.8,0.25',c),
                   ('set label "P(20) %.2f" at 0.8,0.2',p),
                   ('set label "AUC %.4f" at 0.8,0.15',eauc))
    for command,value in annotations:
        chart(command % value)
    roc_curve = Gnuplot.Data(roc_points,title="ROC")
    diagonal = Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7")
    #Gnuplot.Data([roc_points[-1],[1,1]],with_="lines lt 6"))
    chart.plot(roc_curve,diagonal)
    chart.hardcopy(log_file+"-roc.png",terminal="png")
    chart.hardcopy(log_file+"-roc.ps",terminal="postscript",enhanced=1,color=1)
92 | + | ||
def get_label(cfg,sample_proportion):
    """Return the {"description", "values"} label of the current setup.

    The label depends on the category of cfg.strategy (content_based,
    collaborative or hybrid). For a strategy in none of them a message is
    printed and an empty dict is returned. `sample_proportion` is unused.
    """
    strategy = cfg.strategy
    if strategy in content_based:
        return {"description": "strategy-profile",
                "values": "%s-profile%.3d" % (strategy,cfg.profile_size)}
    if strategy in collaborative:
        return {"description": "strategy-knn",
                "values": "%s-k%.3d" % (strategy,cfg.k_neighbors)}
    if strategy in hybrid:
        return {"description": "strategy-knn-profile",
                "values": "%s-k%.3d-profile%.3d" %
                          (strategy,cfg.k_neighbors,cfg.profile_size)}
    print("Unknown strategy")
    return {}
110 | + | ||
class ExperimentResults:
    """Precision, recall and FPR samples per recommendation size.

    The sizes tracked are 1 and every multiple of 10 below the repository
    size; `recommended` accumulates every item seen in a ranking.
    """

    def __init__(self,repo_size):
        self.repository_size = repo_size
        self.recommended = set()
        thresholds = [1]+list(range(10,self.repository_size,10))
        self.precision = dict((size,[]) for size in thresholds)
        self.recall = dict((size,[]) for size in thresholds)
        self.fpr = dict((size,[]) for size in thresholds)

    def add_result(self,ranking,sample):
        """Evaluate `ranking` against `sample` at every tracked size."""
        self.recommended = self.recommended.union(ranking)
        # get data only for point
        for size in self.precision.keys():
            top_items = dict.fromkeys(ranking[:size],1)
            predicted = RecommendationResult(top_items)
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repository_size)
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            self.fpr[size].append(evaluation.run(FPR()))

    # Average ROC by threshold (= size of recommendation)
    def get_roc_points(self):
        """Return the sorted list of [mean FPR, mean TPR] per size."""
        points = [[sum(self.fpr[size])/len(self.fpr[size]),
                   sum(tpr)/len(tpr)]
                  for size,tpr in self.recall.items()]
        points.sort()
        return points
143 | + | ||
def run_strategy(cfg,user):
    """Run the ROC experiment for the strategy currently set in `cfg`.

    For every weighting and sample proportion, `iterations` rounds are
    run: a random part of the user's profile is held out, a full
    recommendation is computed from the remainder and evaluated against
    the held-out sample. Per-round recall logs, a summary file (coverage,
    p(20), AUC, extended AUC) and the ROC plot are written under
    results/roc-suite/<first 8 chars of user.user_id>/.
    """
    for weight in weighting:
        cfg.weight = weight[0]
        cfg.bm25_k1 = weight[1]
        rec = Recommender(cfg)
        repo_size = rec.items_repository.get_doccount()
        for proportion in sample_proportions:
            results = ExperimentResults(repo_size)
            label = get_label(cfg,proportion)
            user_dir = ("results/roc-suite/%s" % user.user_id[:8])
            if not os.path.exists(user_dir):
                # makedirs also creates results/roc-suite when missing
                os.makedirs(user_dir)
            log_file = os.path.join(user_dir,label["values"])
            evaluated = 0
            for n in range(iterations):
                # Fill sample profile
                profile_len = len(user.pkg_profile)
                item_score = {}
                for pkg in user.pkg_profile:
                    item_score[pkg] = user.item_score[pkg]
                sample = {}
                sample_size = int(profile_len*proportion)
                for i in range(sample_size):
                    key = random.choice(list(item_score))
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,repo_size)
                write_recall_log(label,n,sample,recommendation,profile_len,repo_size,log_file)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
                    evaluated += 1
            if not evaluated:
                # No ranking was produced: there is nothing to average
                logging.warning("No ranking produced for %s, skipping summary" %
                                label["values"])
                continue
            roc_points = results.get_roc_points()
            x_coord = [point[0] for point in roc_points]
            y_coord = [point[1] for point in roc_points]
            auc = numpy.trapz(y=y_coord, x=x_coord)
            # Extend the curve to (0,0) and (1,1) for the extended AUC
            eauc = (auc+
                    numpy.trapz(y=[0,roc_points[0][1]],x=[0,roc_points[0][0]])+
                    numpy.trapz(y=[roc_points[-1][1],1],x=[roc_points[-1][0],1]))
            # Precision at threshold 20, as reported in the log and plot
            # (it used to be read from the size-10 samples)
            precision_20 = sum(results.precision[20])/len(results.precision[20])
            coverage = len(results.recommended)/float(repo_size)
            with open(log_file,'w') as f:
                f.write("# %s\n# %s\n\n" %
                        (label["description"],label["values"]))
                f.write("# coverage \tp(20) \tauc \teauc\n\t%.2f \t%.2f \t%.4f \t%.4f\n\n" %
                        (coverage,precision_20,auc,eauc))
            plot_roc(roc_points,eauc,coverage,precision_20,log_file)
188 | + | ||
def run_content(user,cfg):
    """Run every content-based strategy for each profile size."""
    for content_strategy in content_based:
        cfg.strategy = content_strategy
        for n_terms in profile_size:
            cfg.profile_size = n_terms
            run_strategy(cfg,user)
195 | + | ||
def run_collaborative(user,cfg):
    """Run every collaborative strategy for each neighborhood size.

    The unused reads of cfg.popcon_desktopapps and cfg.popcon_programs
    were dropped; nothing in the function consumed them.
    """
    for strategy in collaborative:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            run_strategy(cfg,user)
204 | + | ||
def run_hybrid(user,cfg):
    """Run every hybrid strategy for each (neighborhood, profile) size pair.

    The unused reads of cfg.popcon_desktopapps and cfg.popcon_programs
    were dropped; nothing in the function consumed them.
    """
    for strategy in hybrid:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            for size in profile_size:
                cfg.profile_size = size
                run_strategy(cfg,user)
215 | + | ||
if __name__ == '__main__':
    if len(sys.argv)<2:
        print("Usage: roc-suite popcon_submission_path [content|collaborative|hybrid]")
        exit(1)

    # Build the user profile from the given popcon submission
    cfg = Config()
    user = PopconSystem(sys.argv[1])
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    # Without a category argument all three categories are run
    run_all = len(sys.argv)<3
    if run_all or "content" in sys.argv:
        run_content(user,cfg)
    if run_all or "collaborative" in sys.argv:
        run_collaborative(user,cfg)
    if run_all or "hybrid" in sys.argv:
        run_hybrid(user,cfg)
@@ -0,0 +1,44 @@ | @@ -0,0 +1,44 @@ | ||
1 | +#! /usr/bin/env python | ||
2 | +""" | ||
3 | + sample-popcon-arch - extract a sample of a specific arch | ||
4 | +""" | ||
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | ||
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | ||
7 | +__license__ = """ | ||
8 | + This program is free software: you can redistribute it and/or modify | ||
9 | + it under the terms of the GNU General Public License as published by | ||
10 | + the Free Software Foundation, either version 3 of the License, or | ||
11 | + (at your option) any later version. | ||
12 | + | ||
13 | + This program is distributed in the hope that it will be useful, | ||
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | + GNU General Public License for more details. | ||
17 | + | ||
18 | + You should have received a copy of the GNU General Public License | ||
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 | +""" | ||
21 | +import sys | ||
22 | +sys.path.insert(0,'../') | ||
23 | +import xapian | ||
24 | +import os | ||
25 | +import random | ||
26 | +import sys | ||
27 | +from user import RandomPopcon | ||
28 | + | ||
if __name__ == '__main__':
    # Only a missing argument (IndexError) or a non-integer size
    # (ValueError) means wrong usage; a bare "except:" would also hide
    # unrelated errors such as KeyboardInterrupt.
    try:
        size = int(sys.argv[1])
        arch = sys.argv[2]
        popcon_dir = sys.argv[3]
        pkgs_filter = sys.argv[4]
    except (IndexError, ValueError):
        print("Usage: sample-popcon-arch size arch popcon_dir pkgs_filter")
        sys.exit(1)

    # One user id per line, `size` lines in total
    sample_file = ("results/misc-popcon/sample-%s-%d" % (arch,size))
    with open(sample_file,'w') as f:
        for n in range(1,size+1):
            user = RandomPopcon(popcon_dir,arch,pkgs_filter)
            f.write(user.user_id+'\n')
            print("sample %d" % n)
@@ -0,0 +1,53 @@ | @@ -0,0 +1,53 @@ | ||
1 | +#! /usr/bin/env python | ||
2 | +""" | ||
3 | + sample-popcon - extract a sample from popcon population | ||
4 | +""" | ||
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | ||
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | ||
7 | +__license__ = """ | ||
8 | + This program is free software: you can redistribute it and/or modify | ||
9 | + it under the terms of the GNU General Public License as published by | ||
10 | + the Free Software Foundation, either version 3 of the License, or | ||
11 | + (at your option) any later version. | ||
12 | + | ||
13 | + This program is distributed in the hope that it will be useful, | ||
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | + GNU General Public License for more details. | ||
17 | + | ||
18 | + You should have received a copy of the GNU General Public License | ||
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 | +""" | ||
21 | + | ||
22 | +import xapian | ||
23 | +import os | ||
24 | +import random | ||
25 | +import sys | ||
26 | + | ||
def extract_sample(size,popcon,min_profile,max_profile,output):
    """
    Collect the data of at most `size` popcon documents whose package
    profile has more than min_profile and at most max_profile packages.

    popcon must provide get_doccount() and get_document(docid); each
    document must provide termlist(), whose entries expose a `term`
    attribute, and get_data(). The profile size of a document is the
    number of its terms starting with "XP".

    The selected entries are written, one per line, to the file
    "<output>-<min_profile>-<max_profile>" and are also returned as a
    list. A size of zero or less yields an empty sample.
    """
    sample = []
    # Documents are visited by id, from 1 up to get_doccount().
    # NOTE(review): this assumes the ids are contiguous (no deleted
    # documents in the index) - confirm for the index in use.
    for n in range(1,popcon.get_doccount()+1):
        # Checked before appending so that size <= 0 stops right away
        # instead of collecting every matching document.
        if len(sample) >= size:
            break
        user = popcon.get_document(n)
        profile_size = sum(1 for t in user.termlist()
                           if t.term.startswith("XP"))
        if min_profile < profile_size <= max_profile:
            sample.append(user.get_data())
            # Progress report: document id and current sample length.
            print("%d %d" % (n, len(sample)))
    with open("%s-%d-%d" % (output,min_profile,max_profile),'w') as f:
        for s in sample:
            f.write(s+'\n')
    return sample
41 | + | ||
def parse_args(argv):
    """
    Extract (min_profile, max_profile, size) from a command line.

    argv follows the sys.argv layout, with the program name at index 0.
    Raises IndexError when an argument is missing and ValueError when
    one of them is not an integer.
    """
    min_profile = int(argv[1])
    max_profile = int(argv[2])
    size = int(argv[3])
    return min_profile, max_profile, size


if __name__ == '__main__':
    # The command line is validated before the index is opened, so a bad
    # invocation gets the usage message rather than a database error.
    try:
        min_profile, max_profile, size = parse_args(sys.argv)
    except (IndexError, ValueError):
        # Only argument problems lead to the usage message; a bare except
        # would also swallow KeyboardInterrupt and SystemExit.
        print("Usage: sample-popcon min_profile max_profile sample_size")
        sys.exit(1)
    popcon = xapian.Database(os.path.expanduser("~/.app-recommender/popcon_desktopapps"))
    print("Popcon repository size: %d" % popcon.get_doccount())
    sample_file = "results/misc-popcon/sample"
    extract_sample(size,popcon,min_profile,max_profile,sample_file)
src/recommender.py
@@ -75,20 +75,20 @@ class Recommender: | @@ -75,20 +75,20 @@ class Recommender: | ||
75 | """ | 75 | """ |
76 | self.cfg = cfg | 76 | self.cfg = cfg |
77 | # Load xapian indexes | 77 | # Load xapian indexes |
78 | - self.axi_programs = xapian.Database(cfg.axi_programs) | 78 | + #self.axi_programs = xapian.Database(cfg.axi_programs) |
79 | self.axi_desktopapps = xapian.Database(cfg.axi_desktopapps) | 79 | self.axi_desktopapps = xapian.Database(cfg.axi_desktopapps) |
80 | if cfg.popcon: | 80 | if cfg.popcon: |
81 | - self.popcon_programs = xapian.Database(cfg.popcon_programs) | 81 | + #self.popcon_programs = xapian.Database(cfg.popcon_programs) |
82 | self.popcon_desktopapps = xapian.Database(cfg.popcon_desktopapps) | 82 | self.popcon_desktopapps = xapian.Database(cfg.popcon_desktopapps) |
83 | # Load valid programs, desktopapps and tags | 83 | # Load valid programs, desktopapps and tags |
84 | # format: one package or tag name per line | 84 | # format: one package or tag name per line |
85 | - self.valid_programs = [] | 85 | + #self.valid_programs = [] |
86 | self.valid_desktopapps = [] | 86 | self.valid_desktopapps = [] |
87 | self.valid_tags = [] | 87 | self.valid_tags = [] |
88 | logging.info("Loading recommender filters") | 88 | logging.info("Loading recommender filters") |
89 | - with open(os.path.join(cfg.filters_dir,"programs")) as pkgs: | ||
90 | - self.valid_programs = [line.strip() for line in pkgs | ||
91 | - if not line.startswith("#")] | 89 | + #with open(os.path.join(cfg.filters_dir,"programs")) as pkgs: |
90 | + # self.valid_programs = [line.strip() for line in pkgs | ||
91 | + # if not line.startswith("#")] | ||
92 | with open(os.path.join(cfg.filters_dir,"desktopapps")) as pkgs: | 92 | with open(os.path.join(cfg.filters_dir,"desktopapps")) as pkgs: |
93 | self.valid_desktopapps = [line.strip() for line in pkgs | 93 | self.valid_desktopapps = [line.strip() for line in pkgs |
94 | if not line.startswith("#")] | 94 | if not line.startswith("#")] |
@@ -109,19 +109,21 @@ class Recommender: | @@ -109,19 +109,21 @@ class Recommender: | ||
109 | Set the recommendation strategy. | 109 | Set the recommendation strategy. |
110 | """ | 110 | """ |
111 | logging.info("Setting recommender strategy to \'%s\'" % strategy_str) | 111 | logging.info("Setting recommender strategy to \'%s\'" % strategy_str) |
112 | - if self.cfg.pkgs_filter.split("/")[-1] == "desktopapps": | ||
113 | - self.items_repository = self.axi_desktopapps | ||
114 | - self.valid_pkgs = self.valid_desktopapps | ||
115 | - else: | ||
116 | - self.items_repository = self.axi_programs | ||
117 | - self.valid_pkgs = self.valid_programs | ||
118 | # Check if collaborative strategies can be instantiated | 112 | # Check if collaborative strategies can be instantiated |
119 | - if ("col" in strategy_str) or ("knn" in strategy_str): | 113 | + if "knn" in strategy_str: |
120 | if not self.cfg.popcon: | 114 | if not self.cfg.popcon: |
121 | logging.info("Cannot perform collaborative strategy") | 115 | logging.info("Cannot perform collaborative strategy") |
122 | return 1 | 116 | return 1 |
123 | - else: | ||
124 | - self.users_repository = self.popcon_programs | 117 | + #if self.cfg.pkgs_filter.split("/")[-1] == "desktopapps": |
118 | + self.items_repository = self.axi_desktopapps | ||
119 | + self.valid_pkgs = self.valid_desktopapps | ||
120 | + if "knn" in strategy_str: | ||
121 | + self.users_repository = self.popcon_desktopapps | ||
122 | + #else: | ||
123 | + # self.items_repository = self.axi_programs | ||
124 | + # self.valid_pkgs = self.valid_programs | ||
125 | + # if "knn" in strategy_str: | ||
126 | + # self.users_repository = self.popcon_programs | ||
125 | # Set strategy based on strategy_str | 127 | # Set strategy based on strategy_str |
126 | if strategy_str == "cb": | 128 | if strategy_str == "cb": |
127 | self.strategy = strategy.ContentBased("mix",self.cfg.profile_size) | 129 | self.strategy = strategy.ContentBased("mix",self.cfg.profile_size) |
@@ -151,8 +153,9 @@ class Recommender: | @@ -151,8 +153,9 @@ class Recommender: | ||
151 | self.strategy = strategy.KnnContent(self.cfg.k_neighbors) | 153 | self.strategy = strategy.KnnContent(self.cfg.k_neighbors) |
152 | elif strategy_str == "knnco_eset": | 154 | elif strategy_str == "knnco_eset": |
153 | self.strategy = strategy.KnnContentEset(self.cfg.k_neighbors) | 155 | self.strategy = strategy.KnnContentEset(self.cfg.k_neighbors) |
154 | - elif strategy_str.startswith("demo"): | ||
155 | - self.strategy = strategy.Demographic(strategy_str) | 156 | + # [FIXME: fix repository instantiation] |
157 | + #elif strategy_str.startswith("demo"): | ||
158 | + # self.strategy = strategy.Demographic(strategy_str) | ||
156 | else: | 159 | else: |
157 | logging.info("Strategy not defined.") | 160 | logging.info("Strategy not defined.") |
158 | return | 161 | return |
src/strategy.py
@@ -20,6 +20,7 @@ __license__ = """ | @@ -20,6 +20,7 @@ __license__ = """ | ||
20 | along with this program. If not, see <http://www.gnu.org/licenses/>. | 20 | along with this program. If not, see <http://www.gnu.org/licenses/>. |
21 | """ | 21 | """ |
22 | 22 | ||
23 | +import os | ||
23 | import xapian | 24 | import xapian |
24 | from singleton import Singleton | 25 | from singleton import Singleton |
25 | import recommender | 26 | import recommender |
src/user.py
@@ -111,7 +111,7 @@ class User: | @@ -111,7 +111,7 @@ class User: | ||
111 | """ | 111 | """ |
112 | Define a user of a recommender. | 112 | Define a user of a recommender. |
113 | """ | 113 | """ |
114 | - def __init__(self,item_score,user_id=0,demo_profiles_set=0): | 114 | + def __init__(self,item_score,user_id=0,arch=0,demo_profiles_set=0): |
115 | """ | 115 | """ |
116 | Set initial user attributes. pkg_profile gets the whole set of items, | 116 | Set initial user attributes. pkg_profile gets the whole set of items, |
117 | a random user_id is set if none was provided and the demographic | 117 | a random user_id is set if none was provided and the demographic |
@@ -119,6 +119,7 @@ class User: | @@ -119,6 +119,7 @@ class User: | ||
119 | """ | 119 | """ |
120 | self.item_score = item_score | 120 | self.item_score = item_score |
121 | self.pkg_profile = self.items() | 121 | self.pkg_profile = self.items() |
122 | + self.arch = arch | ||
122 | 123 | ||
123 | if user_id: | 124 | if user_id: |
124 | self.user_id = user_id | 125 | self.user_id = user_id |
@@ -272,21 +273,28 @@ class User: | @@ -272,21 +273,28 @@ class User: | ||
272 | return self.pkg_profile | 273 | return self.pkg_profile |
273 | 274 | ||
class RandomPopcon(User):
    def __init__(self,submissions_dir,arch=0,pkgs_filter=0):
        """
        Build a user from a randomly chosen popcon submission found
        under submissions_dir.

        A submission is accepted when its package profile, after
        pkgs_filter is applied (if one is given), holds at least 100
        packages and, if arch is given, its architecture equals arch.
        The filter only takes part in the selection: the user is
        initialised with the full package list of the chosen submission.

        Raises ValueError when no submission qualifies.
        """
        # The tree is walked once rather than on every attempt.
        candidates = [os.path.join(root, submission) for
                      root, dirs, files in os.walk(submissions_dir)
                      for submission in files]
        # Going through a shuffled list selects uniformly among the
        # acceptable submissions, as repeated random.choice would, but
        # it ends even when nothing qualifies.
        random.shuffle(candidates)
        for path in candidates:
            user = PopconSystem(path)
            # The architecture is checked for each candidate on its own;
            # without a requested arch every architecture is accepted.
            if arch and user.arch != arch:
                continue
            if pkgs_filter:
                user.filter_pkg_profile(pkgs_filter)
            if len(user.pkg_profile) >= 100:
                break
        else:
            raise ValueError("no popcon submission in %s matches the request"
                             % submissions_dir)
        submission = data.PopconSubmission(path)
        User.__init__(self,submission.packages,submission.user_id,submission.arch)
290 | 298 | ||
291 | class PopconSystem(User): | 299 | class PopconSystem(User): |
292 | def __init__(self,path,user_id=0): | 300 | def __init__(self,path,user_id=0): |
@@ -296,7 +304,7 @@ class PopconSystem(User): | @@ -296,7 +304,7 @@ class PopconSystem(User): | ||
296 | submission = data.PopconSubmission(path) | 304 | submission = data.PopconSubmission(path) |
297 | if not user_id: | 305 | if not user_id: |
298 | user_id = submission.user_id | 306 | user_id = submission.user_id |
299 | - User.__init__(self,submission.packages,user_id) | 307 | + User.__init__(self,submission.packages,user_id,submission.arch) |
300 | 308 | ||
301 | class PkgsListSystem(User): | 309 | class PkgsListSystem(User): |
302 | def __init__(self,pkgs_list_or_file,user_id=0): | 310 | def __init__(self,pkgs_list_or_file,user_id=0): |
@@ -36,7 +36,7 @@ button below. | @@ -36,7 +36,7 @@ button below. | ||
36 | </div> | 36 | </div> |
37 | 37 | ||
38 | 38 | ||
39 | -<form action="/save" method="post" enctype="multipart/form-data" name="surveyform"> | 39 | +<form action="save" method="post" enctype="multipart/form-data" name="surveyform"> |
40 | 40 | ||
41 | <input type="hidden" name="user_id" value=$request.user.user_id> | 41 | <input type="hidden" name="user_id" value=$request.user.user_id> |
42 | <input type="hidden" name="strategy" value=$request.strategy> | 42 | <input type="hidden" name="strategy" value=$request.strategy> |