Commit 6d9bfe1d7e44ab36152b3b97fd12208d56f27dfb

Authored by Tássia Camões Araújo
2 parents b9ecf615 c673b9b2
Exists in master and in 1 other branch add_vagrant

Merge branch 'master' of github.com:tassia/AppRecommender

src/bin/cross_validation.py
... ... @@ -37,7 +37,7 @@ if __name__ == '__main__':
37 37 #user = LocalSystem()
38 38 #user = RandomPopcon(cfg.popcon_dir)
39 39 #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
40   - user = PopconSystem("/home/tassia/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
  40 + user = PopconSystem(os.path.expanduser("~/.app-recommender/popcon-entries/00/0001166d0737c6dffb083071e5ee69f5"))
41 41 user.filter_pkg_profile(os.path.join(cfg.filters_dir,"desktopapps"))
42 42 user.maximal_pkg_profile()
43 43 begin_time = datetime.datetime.now()
... ... @@ -48,7 +48,7 @@ if __name__ == '__main__':
48 48 metrics.append(F_score(0.5))
49 49 metrics.append(Accuracy())
50 50 metrics.append(FPR())
51   - validation = CrossValidation(0.9,10,rec,metrics,1)
  51 + validation = CrossValidation(0.9,20,rec,metrics,0.005)
52 52 validation.run(user)
53 53 print validation
54 54  
... ...
src/bin/get_axipkgs.py 0 → 100755
... ... @@ -0,0 +1,42 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + AppRecommender - A GNU/Linux application recommender
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import os
  23 +import sys
  24 +sys.path.insert(0,'../')
  25 +import xapian
  26 +
def axi_pkg_names(axi):
    """
    Yield the package name stored in each document of a Xapian index.

    axi is an open xapian.Database. The name is taken from the first term
    of a document that carries the "XP" prefix; documents without such a
    term are skipped.
    """
    # get_lastdocid() is the highest id in use, so the range has to include
    # it (range() excludes its upper bound).
    for docid in range(1, axi.get_lastdocid() + 1):
        try:
            doc = axi.get_document(docid)
        except xapian.DocNotFoundError:
            # No document is stored under this id; move on to the next one.
            continue
        for item in doc.termlist():
            if item.term.startswith("XP"):
                # Slice the prefix off: str.lstrip('XP') removes a set of
                # characters and would also eat leading X/P of the name.
                yield item.term[2:]
                break


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: get_axipkgs index_path")
        sys.exit(1)

    for pkg_name in axi_pkg_names(xapian.Database(sys.argv[1])):
        print(pkg_name)
... ...
src/bin/get_desktop.sh
1 1 #!/usr/bin/env bash
2 2 #
3   -# get_desktop.sh - get packages which have desktop files
  3 +# get_desktop.sh - get packages which have desktop files
  4 +#
  5 +# DEPRECATED: use get_axipkgs.py to get this info from axi
4 6  
5 7 cd /usr/share/app-install/desktop
6 8 sed -ne 's/X-AppInstall-Package=//p' * | sort -u | grep -v kdelibs | grep -v libfm-gtk0
... ...
src/bin/get_pkgs_inst.py
1 1 #!/usr/bin/env python
2 2 #
3 3 # get_pkgs_inst.py - get tuple (package,installation) from popcon results file
  4 +#
  5 +# results_file: org/popcon.debian.org/popcon-mail/results
4 6  
  7 +import sys
5 8 from operator import itemgetter
  9 +
6 10 if __name__ == '__main__':
  11 + if len(sys.argv)<2:
  12 + print "Usage: get_pkgs_inst popcon_results_path"
  13 + exit(1)
  14 +
  15 + results_path = sys.argv[1]
7 16 pkgs_inst = {}
8   - with open("/root/org/popcon.debian.org/popcon-mail/results") as results:
  17 + with open(results_path) as results:
9 18 for line in results:
10 19 if line.startswith("Package"):
11 20 fields = line.split()
12 21 inst = int(fields[2])+int(fields[3])+int(fields[4])
13   - if inst > 20:
14   - pkgs_inst[fields[1]] = inst
  22 + pkgs_inst[fields[1]] = inst
15 23 sorted_by_inst = sorted(pkgs_inst.items(), key=itemgetter(1))
16 24 for pkg, inst in sorted_by_inst:
17 25 print pkg, inst
... ...
src/bin/indexer_axi.py 0 → 100755
... ... @@ -0,0 +1,77 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + indexer.py - generate xapian indexes to be used as items and users
  4 + repositories
  5 +"""
  6 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  7 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  8 +__license__ = """
  9 + This program is free software: you can redistribute it and/or modify
  10 + it under the terms of the GNU General Public License as published by
  11 + the Free Software Foundation, either version 3 of the License, or
  12 + (at your option) any later version.
  13 +
  14 + This program is distributed in the hope that it will be useful,
  15 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17 + GNU General Public License for more details.
  18 +
  19 + You should have received a copy of the GNU General Public License
  20 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  21 +"""
  22 +
  23 +import os
  24 +import sys
  25 +sys.path.insert(0,'../')
  26 +import datetime
  27 +
  28 +from config import Config
  29 +from error import Error
  30 +import data
  31 +import xapian
  32 +
if __name__ == '__main__':
    # Builds a reduced copy of the system apt-xapian-index under
    # ~/.app-recommender/. Two modes, selected by a keyword on the command
    # line:
    #   sample pkgs_sample_file      - keep the packages listed in the file
    #   filter term [more terms]     - keep the packages matching the terms
    axi_path = "/var/lib/apt-xapian-index/index"
    axi = xapian.Database(axi_path)
    base_dir = os.path.expanduser("~/.app-recommender/")

    begin_time = datetime.datetime.now()

    # axi sample based on the pkgs sample provided by command line
    if "sample" in sys.argv:
        print("Sample package indexing started at %s" % begin_time)
        if len(sys.argv) <= 2:
            # The keyword tested above is "sample", so that is what the
            # usage line has to advertise.
            print("Usage: indexer_axi.py sample pkgs_sample_file")
            sys.exit(1)
        pkgs_filter = sys.argv[2]
        with open(pkgs_filter) as valid:
            pkgs_list = [line.strip() for line in valid]
        # The output index is named after the sample file (last path part).
        filter_str = pkgs_filter.split("/")[-1]
        index = data.SampleAptXapianIndex(pkgs_list,axi,
                                          os.path.join(base_dir,"axi_"+filter_str))
        print("Axi size: %d" % axi.get_doccount())
        print("Packages list length: %d" % len(pkgs_list))
        print("Sample index size: %d" %
              index.get_doccount())

    # axi filtered by terms provided by command line
    if "filter" in sys.argv:
        print("Filtered package indexing started at %s" % begin_time)
        if len(sys.argv) <= 2:
            print("Usage: indexer_axi.py filter term [additional terms]")
            sys.exit(1)
        terms = sys.argv[2:]
        # Name the output index after the terms, dropping any "facet::" part.
        terms_str = "_".join([t.split("::")[-1] for t in terms])
        index = data.FilteredXapianIndex(terms,axi,
                                         os.path.join(base_dir,"axi_"+terms_str))
        print("Axi size: %d" % axi.get_doccount())
        print("Terms filter: %s" % terms)
        print("Filtered index size: %d" %
              index.get_doccount())

    end_time = datetime.datetime.now()
    print("Indexing completed at %s" % end_time)
    delta = end_time - begin_time
    # timedelta.seconds only holds the remainder below one day; add the days
    # back so long runs are not under-reported.
    print("Time elapsed: %d seconds." % (delta.days*86400 + delta.seconds))
... ...
src/bin/indexer_popcon.py 0 → 100755
... ... @@ -0,0 +1,52 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + popindex.py - generate a popcon index to be used by the recommender as the
  4 + users repository, based on filters provided by config
  5 +"""
  6 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  7 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  8 +__license__ = """
  9 + This program is free software: you can redistribute it and/or modify
  10 + it under the terms of the GNU General Public License as published by
  11 + the Free Software Foundation, either version 3 of the License, or
  12 + (at your option) any later version.
  13 +
  14 + This program is distributed in the hope that it will be useful,
  15 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17 + GNU General Public License for more details.
  18 +
  19 + You should have received a copy of the GNU General Public License
  20 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  21 +"""
  22 +import os
  23 +import sys
  24 +sys.path.insert(0,'../')
  25 +import logging
  26 +import datetime
  27 +
  28 +from config import Config
  29 +from data import FilteredPopconXapianIndex
  30 +
if __name__ == '__main__':
    # Builds the popcon index "popcon_XD" from the submissions stored under
    # ~/.app-recommender/popcon-entries, using the "axi_XD" index and the
    # debtags filter file found in the same base directory.
    base_dir = os.path.expanduser("~/.app-recommender/")
    axi_path = os.path.join(base_dir,"axi_XD")
    path = os.path.join(base_dir,"popcon_XD")
    popcon_dir = os.path.join(base_dir,"popcon-entries")
    tags_filter = os.path.join(base_dir,"filters/debtags")

    # set up config for logging (the object itself is not used below)
    cfg = Config()

    begin_time = datetime.datetime.now()
    logging.info("Popcon indexing started at %s" % begin_time)
    # TODO: use config file or command line options instead of fixed paths
    index = FilteredPopconXapianIndex(path,popcon_dir,axi_path,tags_filter)

    end_time = datetime.datetime.now()
    logging.info("Popcon indexing completed at %s" % end_time)
    logging.info("Number of documents (submissions): %d" %
                 index.get_doccount())

    delta = end_time - begin_time
    # timedelta.seconds only holds the remainder below one day; add the days
    # back so long runs are not under-reported.
    logging.info("Time elapsed: %d seconds." %
                 (delta.days*86400 + delta.seconds))
... ...
src/config.py
... ... @@ -40,7 +40,7 @@ class Config(Singleton):
40 40 ## general options
41 41 self.debug = 0
42 42 self.verbose = 1
43   - self.output = "log"
  43 + self.output = "apprec.log"
44 44  
45 45 ## data_source options
46 46 self.base_dir = os.path.expanduser("~/.app-recommender/")
... ... @@ -103,13 +103,14 @@ class Config(Singleton):
103 103 print " -f, --filtersdir=PATH Path to filters directory"
104 104 print " -b, --pkgsfilter=FILTER File containing packages to be considered for recommendations"
105 105 print " -a, --axi=PATH Path to apt-xapian-index"
106   - print " -e, --dde=URL DDE url"
107 106 print " -p, --popconindex=PATH Path to popcon index"
108   - print " -m, --popcondir=PATH Path to popcon submissions dir"
109   - print " -u, --indexmode=MODE 'old'|'reindex'|'cluster'|'recluster'"
110   - print " -l, --clustersdir=PATH Path to popcon clusters dir"
111   - print " -c, --medoids=k Number of medoids for clustering"
112   - print " -x, --maxpopcon=k Number of submissions to be considered"
  107 + print " -e, --dde=URL DDE url"
  108 + # deprecated options
  109 + #print " -m, --popcondir=PATH Path to popcon submissions dir"
  110 + #print " -u, --indexmode=MODE 'old'|'reindex'|'cluster'|'recluster'"
  111 + #print " -l, --clustersdir=PATH Path to popcon clusters dir"
  112 + #print " -c, --medoids=k Number of medoids for clustering"
  113 + #print " -x, --maxpopcon=k Number of submissions to be considered"
113 114 print ""
114 115 print " [ recommender ]"
115 116 print " -w, --weight=OPTION Search weighting scheme"
... ... @@ -123,11 +124,19 @@ class Config(Singleton):
123 124 print " bm25 = bm25 weighting scheme"
124 125 print ""
125 126 print " [ strategy options ] "
126   - print " cb = content-based "
127   - print " cbt = content-based using only tags as content "
128   - print " cbd = content-based using only package descriptions as content "
129   - print " col = collaborative "
130   - print " colct = collaborative through tags content "
  127 + print " cb = content-based, mixed profile"
  128 + print " cbt = content-based, tags only profile"
  129 + print " cbd = content-based, description terms only profile"
  130 + print " cbh = content-based, half-half profile"
  131 + print " cb_eset = cb with eset profiling"
  132 + print " cbt_eset = cbt with eset profiling"
  133 + print " cbd_eset = cbd_eset with eset profiling"
  134 + print " cbh_eset = cbh with eset profiling"
  135 + print " knn = collaborative, tf-idf knn"
  136 + print " knn_plus = collaborative, tf-idf weighted knn"
  137 + print " knn_eset = collaborative, eset knn"
  138 + print " knnco = collaborative through content"
  139 + print " knnco_eset = collaborative through content, eset recommendation"
131 140  
132 141 def read_option(self, section, option):
133 142 """
... ...
src/data.py
... ... @@ -30,12 +30,26 @@ import shutil
30 30 import apt
31 31 import re
32 32 import operator
  33 +import urllib
  34 +import simplejson as json
33 35  
34 36 from error import Error
35 37 from singleton import Singleton
36 38 from dissimilarity import *
37 39 from config import Config
38 40  
def axi_get_pkgs(axi):
    """
    Return the list of package names stored in the given xapian index.

    Every document id from 1 to axi.get_lastdocid() is visited. The name is
    taken from the first term of the document that starts with "XP". Ids
    whose document cannot be retrieved, and documents that carry no "XP"
    term, contribute nothing to the result.
    """
    pkgs_names = []
    for docid in range(1,axi.get_lastdocid()+1):
        try:
            doc = axi.get_document(docid)
        except Exception:
            # Best effort, as before, but skip the id: falling through would
            # reuse the document of the previous iteration and report its
            # package a second time.
            logging.debug("Doc %d not found in axi." % docid)
            continue
        for item in doc.termlist():
            if item.term.startswith("XP"):
                # Slice the prefix off: str.lstrip('XP') removes a set of
                # characters and would also eat leading X/P of the name.
                pkgs_names.append(item.term[2:])
                break
    return pkgs_names
  52 +
39 53 def axi_search_pkgs(axi,pkgs_list):
40 54 terms = ["XP"+item for item in pkgs_list]
41 55 query = xapian.Query(xapian.Query.OP_OR, terms)
... ... @@ -110,30 +124,39 @@ def tfidf_plus(index,docs,content_filter):
110 124 variance = sum([(p-mean)*(p-mean) for p in population])/len(population)
111 125 standard_deviation = math.sqrt(variance)
112 126 for d in docs:
113   - normalized_weigths[d.docid] = d.weight/standard_deviation
  127 + if standard_deviation>1:
  128 + # values between [0-1] would cause the opposite effect
  129 + normalized_weigths[d.docid] = d.weight/standard_deviation
  130 + else:
  131 + normalized_weigths[d.docid] = d.weight
114 132 return tfidf_weighting(index,docs,content_filter,normalized_weigths)
115 133  
116   -class AppAptXapianIndex(xapian.WritableDatabase):
  134 +class FilteredXapianIndex(xapian.WritableDatabase):
117 135 """
118   - Data source for application packages information
  136 + Filtered Xapian Index
119 137 """
120   - def __init__(self,axi_path,path):
  138 + def __init__(self,terms,index_path,path):
121 139 xapian.WritableDatabase.__init__(self,path,
122 140 xapian.DB_CREATE_OR_OVERWRITE)
123   - axi = xapian.Database(axi_path)
124   - logging.info("AptXapianIndex size: %d" % axi.get_doccount())
125   - for docid in range(1,axi.get_lastdocid()+1):
  141 + index = xapian.Database(index_path)
  142 + for docid in range(1,index.get_lastdocid()+1):
126 143 try:
127   - doc = axi.get_document(docid)
128   - allterms = [term.term for term in doc.termlist()]
129   - if "XTrole::program" in allterms:
  144 + doc = index.get_document(docid)
  145 + docterms = [term.term for term in doc.termlist()]
  146 + tagged = False
  147 + for t in terms:
  148 + if t in docterms:
  149 + tagged = True
  150 + if tagged:
130 151 self.add_document(doc)
131 152 logging.info("Added doc %d." % docid)
132 153 else:
133 154 logging.info("Discarded doc %d." % docid)
134 155 except:
135 156 logging.info("Doc %d not found in axi." % docid)
136   - logging.info("AppAptXapianIndex size: %d (lastdocid: %d)." %
  157 + logging.info("Filter: %s" % terms)
  158 + logging.info("Index size: %d" % index.get_doccount())
  159 + logging.info("Filtered Index size: %d (lastdocid: %d)." %
137 160 (self.get_doccount(), self.get_lastdocid()))
138 161  
139 162 def __str__(self):
... ... @@ -186,13 +209,13 @@ class DebianPackage():
186 209 if pkg_version.record.has_key('Conflicts'):
187 210 self.conflicts = pkg_version.record['Conflicts']
188 211 if pkg_version.record.has_key('Replaces'):
189   - self.conflicts = pkg_version.record['Replaces']
  212 + self.replaces = pkg_version.record['Replaces']
190 213 if pkg_version.record.has_key('Provides'):
191 214 self.provides = pkg_version.record['Provides']
192 215  
193 216 def load_details_from_dde(self,dde_server,dde_port):
194   - json_data = json.load(urllib.urlopen("http://%s:%s/q/udd/packages/all/%s?t=json"
195   - % dde_server,dde_port,self.name))
  217 + json_data = json.load(urllib.urlopen("http://%s:%d/q/udd/packages/prio-debian-sid/%s?t=json"
  218 + % (dde_server,dde_port,self.name)))
196 219  
197 220 self.maintainer = json_data['r']['maintainer']
198 221 self.version = json_data['r']['version']
... ... @@ -200,27 +223,27 @@ class DebianPackage():
200 223 self.description = self.format_description(json_data['r']['long_description'])
201 224 self.section = json_data['r']['section']
202 225 if json_data['r']['homepage']:
203   - self.conflicts = json_data['r']['homepage']
  226 + self.homepage = json_data['r']['homepage']
204 227 if json_data['r']['tag']:
205 228 self.tags = self.debtags_list_to_dict(json_data['r']['tag'])
206 229 if json_data['r']['depends']:
207 230 self.depends = json_data['r']['depends']
208 231 if json_data['r']['pre_depends']:
209   - self.conflicts = json_data['r']['pre_depends']
  232 + self.predepends = json_data['r']['pre_depends']
210 233 if json_data['r']['recommends']:
211   - self.conflicts = json_data['r']['recommends']
  234 + self.recommends = json_data['r']['recommends']
212 235 if json_data['r']['suggests']:
213   - self.conflicts = json_data['r']['suggests']
  236 + self.suggests = json_data['r']['suggests']
214 237 if json_data['r']['conflicts']:
215 238 self.conflicts = json_data['r']['conflicts']
216 239 if json_data['r']['replaces']:
217   - self.conflicts = json_data['r']['replaces']
  240 + self.replaces = json_data['r']['replaces']
218 241 if json_data['r']['provides']:
219   - self.conflicts = json_data['r']['provides']
  242 + self.provides = json_data['r']['provides']
220 243 self.popcon_insts = json_data['r']['popcon']['insts']
221 244  
222 245 def format_description(self,description):
223   - return description.replace('.\n','').replace('\n','<br />')
  246 + return description.replace(' .\n','<br />').replace('\n','<br />')
224 247  
225 248 def debtags_str_to_dict(self, debtags_str):
226 249 debtags_list = [tag.rstrip(",") for tag in debtags_str.split()]
... ... @@ -281,6 +304,7 @@ class PopconSubmission():
281 304 for line in submission:
282 305 if line.startswith("POPULARITY"):
283 306 self.user_id = line.split()[2].lstrip("ID:")
  307 + self.arch = line.split()[3].lstrip("ARCH:")
284 308 elif not line.startswith("END-POPULARITY"):
285 309 data = line.rstrip('\n').split()
286 310 if len(data) > 2:
... ... @@ -304,6 +328,82 @@ class PopconSubmission():
304 328 elif data[4] == '<RECENT-CTIME>':
305 329 self.packages[pkg] = 8
306 330  
class FilteredPopconXapianIndex(xapian.WritableDatabase):
    """
    Data source for popcon submissions defined as a xapian database.

    One document is added per popcon submission that still has at least 10
    packages after being filtered against the packages of the given axi.
    """
    def __init__(self,path,popcon_dir,axi_path,tags_filter):
        """
        Set initial attributes and build the index from scratch.

        path        -- directory of the index to create; any previous
                       content is removed
        popcon_dir  -- directory tree holding the popcon submission files
        axi_path    -- xapian index that defines the valid packages
        tags_filter -- file with one valid tag per line; lines starting
                       with "#" are ignored

        Raises Error if popcon_dir is empty or the index cannot be created.
        """
        self.axi = xapian.Database(axi_path)
        self.path = os.path.expanduser(path)
        self.popcon_dir = os.path.expanduser(popcon_dir)
        self.valid_pkgs = axi_get_pkgs(self.axi)
        logging.debug("Considering %d valid packages" % len(self.valid_pkgs))
        with open(tags_filter) as valid_tags:
            self.valid_tags = [line.strip() for line in valid_tags
                               if not line.startswith("#")]
        logging.debug("Considering %d valid tags" % len(self.valid_tags))
        if not os.path.exists(self.popcon_dir):
            os.makedirs(self.popcon_dir)
        if not os.listdir(self.popcon_dir):
            logging.critical("Popcon dir seems to be empty.")
            raise Error

        # set up directory
        shutil.rmtree(self.path,ignore_errors=True)
        os.makedirs(self.path)
        try:
            logging.info("Indexing popcon submissions from '%s'" %
                         self.popcon_dir)
            logging.info("Creating new xapian index at '%s'" %
                         self.path)
            xapian.WritableDatabase.__init__(self,self.path,
                                             xapian.DB_CREATE_OR_OVERWRITE)
        except xapian.DatabaseError as e:
            logging.critical("Could not create popcon xapian index.")
            logging.critical(str(e))
            raise Error

        # build new index
        # Membership is tested once per tag of every package, so look the
        # tags up in a set; self.valid_tags itself stays a list.
        tags_lookup = set(self.valid_tags)
        for root, dirs, files in os.walk(self.popcon_dir):
            for popcon_file in files:
                submission = PopconSubmission(os.path.join(root, popcon_file))
                submission_pkgs = submission.get_filtered(self.valid_pkgs)
                if len(submission_pkgs) < 10:
                    logging.debug("Low profile popcon submission '%s' (%d)" %
                                  (submission.user_id,len(submission_pkgs)))
                    continue
                doc = xapian.Document()
                doc.set_data(submission.user_id)
                doc.add_term("ID"+submission.user_id)
                doc.add_term("ARCH"+submission.arch)
                logging.debug("Parsing popcon submission '%s'" %
                              submission.user_id)
                for pkg,freq in submission_pkgs.items():
                    tags = axi_search_pkg_tags(self.axi,pkg)
                    # if the package was not found in axi
                    if not tags:
                        continue
                    doc.add_term("XP"+pkg,freq)
                    # if the package has no tags associated with it
                    if tags == "notags":
                        continue
                    for tag in tags:
                        # Drop the "XT" prefix by slicing: str.lstrip("XT")
                        # removes a set of characters, not a prefix.
                        tag_name = tag[2:] if tag.startswith("XT") else tag
                        if tag_name in tags_lookup:
                            doc.add_term(tag,freq)
                doc_id = self.add_document(doc)
                logging.debug("Popcon Xapian: Indexing doc %d" % doc_id)
            # python garbage collector
            gc.collect()
        # flush to disk database changes
        try:
            self.commit()
        except AttributeError:
            # Old lib versions have no commit(); flush() is its deprecated
            # predecessor. A commit() that fails for any other reason is
            # reported to the caller instead of being retried silently.
            self.flush()
  405 +
  406 +# Deprecated class, must be reviewed
307 407 class PopconXapianIndex(xapian.WritableDatabase):
308 408 """
309 409 Data source for popcon submissions defined as a singleton xapian database.
... ...
src/evaluation.py
... ... @@ -140,6 +140,29 @@ class FPR(Metric):
140 140 return (float(len(evaluation.false_positive))/
141 141 evaluation.real_negative_len)
142 142  
class MCC(Metric):
    """
    Matthews correlation coefficient.
    """
    def __init__(self):
        """
        Set metric description.
        """
        self.desc = " MCC "

    def run(self,evaluation):
        """
        Compute metric.

        Returns 0 when any of the four marginal sums is zero, since the
        coefficient's denominator would then be zero.
        """
        tp = len(evaluation.true_positive)
        fp = len(evaluation.false_positive)
        fn = len(evaluation.false_negative)
        tn = evaluation.true_negative_len
        # The four marginal sums whose product sits under the square root.
        predicted_pos = tp+fp
        real_pos = tp+fn
        real_neg = tn+fp
        predicted_neg = tn+fn
        if 0 in (predicted_pos,real_pos,real_neg,predicted_neg):
            return 0
        denominator = math.sqrt(predicted_pos*real_pos*real_neg*predicted_neg)
        return ((tp*tn)-(fp*fn))/denominator
  165 +
143 166 class F_score(Metric):
144 167 """
145 168 Classification accuracy metric which correlates precision and recall into an
... ...
src/experiments/deprecated/README 0 → 100644
... ... @@ -0,0 +1,2 @@
  1 +Experiments handled by expsuite:
  2 +https://github.com/rueckstiess/expsuite
... ...
src/experiments/deprecated/clustering-suite.py 0 → 100755
... ... @@ -0,0 +1,51 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + recommender suite - recommender experiments suite
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import sys
  23 +import os
  24 +sys.path.insert(0,'../')
  25 +from config import Config
  26 +from data import PopconXapianIndex, PopconSubmission
  27 +from recommender import Recommender
  28 +from user import LocalSystem, User
  29 +from evaluation import *
  30 +import logging
  31 +import random
  32 +import Gnuplot
  33 +
if __name__ == '__main__':
    # Rebuilds the popcon index in "recluster" mode and records the
    # resulting cluster dispersion, both in the log and in a results file
    # named after the number of medoids and the popcon limit.

    cfg = Config()
    cfg.index_mode = "recluster"
    logging.info("Starting clustering experiments")
    logging.info("Medoids: %d\t Max popcon:%d" % (cfg.k_medoids,cfg.max_popcon))
    cfg.popcon_dir = os.path.expanduser("~/org/popcon.debian.org/popcon-mail/popcon-entries/")
    # Index and clusters directories get a suffix identifying this setup.
    suffix = "_%dmedoids%dmax" % (cfg.k_medoids,cfg.max_popcon)
    cfg.popcon_index = cfg.popcon_index+suffix
    cfg.clusters_dir = cfg.clusters_dir+suffix
    pxi = PopconXapianIndex(cfg)
    logging.info("Overall dispersion: %f\n" % pxi.cluster_dispersion)
    # Write clustering log
    log_path = "results/clustering/%dmedoids%dmax" % (cfg.k_medoids,cfg.max_popcon)
    with open(log_path,'w') as output:
        output.write("# k_medoids\tmax_popcon\tdispersion\n")
        # One specifier per column of the header above; with only two
        # specifiers the three-value tuple raises TypeError.
        output.write("%d %d %f\n" % (cfg.k_medoids,cfg.max_popcon,pxi.cluster_dispersion))
... ...
src/experiments/deprecated/experiments.cfg 0 → 100644
... ... @@ -0,0 +1,27 @@
  1 +[DEFAULT]
  2 +repetitions = 1
  3 +iterations = 10
  4 +path = 'results'
  5 +experiment = 'grid'
  6 +weight = ['bm25', 'trad']
  7 +;profile_size = range(10,100,10)
  8 +;sample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
  9 +sample = [0.6, 0.7, 0.8, 0.9]
  10 +
  11 +[content]
  12 +strategy = ['cb','cbt','cbd']
  13 +
  14 +[clustering]
  15 +experiment = 'single'
  16 +;iterations = 4
  17 +;medoids = range(2,6)
  18 +iterations = 6
  19 +medoids = [100,500,1000,5000,10000,50000]
  20 +;disabled for this experiment
  21 +weight = 0
  22 +profile_size = 0
  23 +sample = 0
  24 +
  25 +[colaborative]
  26 +users_repository=["data/popcon","data/popcon-100","data/popcon-500","data/popcon-1000","data/popcon-5000","data/popcon-10000","data/popcon-50000"]
  27 +neighbors = range(10,1010,50)
... ...
src/experiments/deprecated/runner.py 0 → 100755
... ... @@ -0,0 +1,171 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + recommender suite - recommender experiments suite
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import expsuite
  23 +import sys
  24 +sys.path.insert(0,'../')
  25 +from config import Config
  26 +from data import PopconXapianIndex, PopconSubmission
  27 +from recommender import Recommender
  28 +from user import LocalSystem, User
  29 +from evaluation import *
  30 +import logging
  31 +import random
  32 +import Gnuplot
  33 +
class ClusteringSuite(expsuite.PyExperimentSuite):
    """
    Experiment suite that re-clusters the sample popcon data once per
    configured number of medoids and reports the cluster dispersion.
    """
    def reset(self, params, rep):
        """
        Point the configuration at the test data set; switch to
        "recluster" mode for the experiment named "clustering".
        """
        self.cfg = Config()
        self.cfg.popcon_index = "../tests/test_data/.sample_pxi"
        self.cfg.popcon_dir = "../tests/test_data/popcon_dir"
        self.cfg.clusters_dir = "../tests/test_data/clusters_dir"

        if params['name'] != "clustering":
            return
        logging.info("Starting 'clustering' experiments suite...")
        self.cfg.index_mode = "recluster"

    def iterate(self, params, rep, n):
        """
        Build the index with the n-th medoids value and return it together
        with the dispersion obtained; other experiments yield an empty
        result.
        """
        if params['name'] != "clustering":
            return {}
        medoids = params['medoids'][n]
        logging.info("Running iteration %d" % medoids)
        self.cfg.k_medoids = medoids
        pxi = PopconXapianIndex(self.cfg)
        return {'k_medoids': medoids,
                'dispersion': pxi.cluster_dispersion}
  55 +
class ContentBasedSuite(expsuite.PyExperimentSuite):
    """
    Experiment suite for the content-based strategies: part of the local
    system's package profile is held out, a full recommendation is computed
    from the rest, and the rank of the held-out packages is evaluated.
    """
    def reset(self, params, rep):
        """
        Set up recommender, user profile and sample size for experiments
        whose name starts with "content".
        """
        if params['name'].startswith("content"):
            cfg = Config()
            #if the index was not built yet
            #app_axi = AppAptXapianIndex(cfg.axi,"results/arnaldo/AppAxi")
            cfg.axi = "data/AppAxi"
            cfg.index_mode = "old"
            cfg.weight = params['weight']
            self.rec = Recommender(cfg)
            self.rec.set_strategy(params['strategy'])
            self.repo_size = self.rec.items_repository.get_doccount()
            self.user = LocalSystem()
            self.user.app_pkg_profile(self.rec.items_repository)
            self.user.no_auto_pkg_profile()
            self.sample_size = int(len(self.user.pkg_profile)*params['sample'])
            # iteration should be set to 10 in config file
            #self.profile_size = range(10,101,10)

    def iterate(self, params, rep, n):
        """
        Run one hold-out round, write the recall log and the metrics plot,
        and return the iteration summary.

        Experiments whose name does not start with "content" yield an empty
        result.
        """
        if not params['name'].startswith("content"):
            return {}

        item_score = dict.fromkeys(self.user.pkg_profile,1)
        # Prepare partition: move sample_size random packages out of the
        # profile. list() keeps random.choice working where keys() is a view.
        sample = {}
        for i in range(self.sample_size):
            key = random.choice(list(item_score))
            sample[key] = item_score.pop(key)
        # Get full recommendation
        user = User(item_score)
        recommendation = self.rec.get_recommendation(user,self.repo_size)
        # Split the held-out packages into ranked and not recommended
        notfound = []
        ranks = []
        for pkg in sample.keys():
            if pkg in recommendation.ranking:
                ranks.append(recommendation.ranking.index(pkg))
            else:
                notfound.append(pkg)
        # Write recall log
        recall_file = "results/content/recall/%s-%s-%.2f-%d" % \
                      (params['strategy'],params['weight'],params['sample'],n)
        with open(recall_file,'w') as output:
            output.write("# weight=%s\n" % params['weight'])
            output.write("# strategy=%s\n" % params['strategy'])
            output.write("# sample=%f\n" % params['sample'])
            output.write("\n%d %d %d\n" % \
                         (self.repo_size,len(item_score),self.sample_size))
            for r in sorted(ranks):
                output.write(str(r)+"\n")
            if notfound:
                output.write("Out of recommendation:\n")
                for pkg in notfound:
                    output.write(pkg+"\n")
        # Plot metrics summary
        accuracy = []
        precision = []
        recall = []
        f1 = []
        g = Gnuplot.Gnuplot()
        g('set style data lines')
        g.xlabel('Recommendation size')
        for size in range(1,len(recommendation.ranking)+1,100):
            predicted = RecommendationResult(dict.fromkeys(recommendation.ranking[:size],1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repo_size)
            accuracy.append([size,evaluation.run(Accuracy())])
            precision.append([size,evaluation.run(Precision())])
            recall.append([size,evaluation.run(Recall())])
            f1.append([size,evaluation.run(F1())])
        g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
               Gnuplot.Data(precision,title="Precision"),
               Gnuplot.Data(recall,title="Recall"),
               Gnuplot.Data(f1,title="F1"))
        g.hardcopy(recall_file+"-plot.ps", enhanced=1, color=1)
        # Iteration log: report the 21st sampled point (recommendation size
        # 2001), or the last one when the ranking is shorter than that.
        point = min(20,len(accuracy)-1)
        return {'iteration': n,
                'weight': params['weight'],
                'strategy': params['strategy'],
                'accuracy': accuracy[point],
                'precision': precision[point],
                'recall': recall[point],
                'f1': f1[point]}
  139 +
  140 +#class CollaborativeSuite(expsuite.PyExperimentSuite):
  141 +# def reset(self, params, rep):
  142 +# if params['name'].startswith("collaborative"):
  143 +#
  144 +# def iterate(self, params, rep, n):
  145 +# if params['name'].startswith("collaborative"):
  146 +# for root, dirs, files in os.walk(self.source_dir):
  147 +# for popcon_file in files:
  148 +# submission = PopconSubmission(os.path.join(root,popcon_file))
  149 +# user = User(submission.packages)
  150 +# user.maximal_pkg_profile()
  151 +# rec.get_recommendation(user)
  152 +# precision = 0
  153 +# result = {'weight': params['weight'],
  154 +# 'strategy': params['strategy'],
  155 +# 'profile_size': self.profile_size[n],
  156 +# 'accuracy': accuracy,
  157 +# 'precision': precision,
  158 +# 'recall:': recall,
  159 +# 'f1': }
  160 +# else:
  161 +# result = {}
  162 +# return result
  163 +
if __name__ == '__main__':
    # Each suite starts when its name is given on the command line, or
    # unconditionally when fewer than three command-line tokens are present.
    if len(sys.argv) < 3 or "clustering" in sys.argv:
        ClusteringSuite().start()
    if len(sys.argv) < 3 or "content" in sys.argv:
        ContentBasedSuite().start()
    #if "collaborative" in sys.argv or len(sys.argv)<3:
        #CollaborativeSuite().start()
... ...
src/experiments/extract-sample-db.py 0 → 100755
... ... @@ -0,0 +1,49 @@
  1 +#! /usr/bin/env python
  2 +"""
  3 + sample-popcon - extract a sample from popcon population
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import xapian
  23 +import os
  24 +import random
  25 +import sys
  26 +
if __name__ == '__main__':
    # Usage: extract-sample-db sample_file popcon_index
    #
    # For every submission ID listed in sample_file (one per line), copy the
    # matching documents from the popcon index into a new index named
    # "<popcon_index>-<sample file name>" and delete them from the original.
    try:
        sample_file = sys.argv[1]
        popcon = xapian.WritableDatabase(sys.argv[2], xapian.DB_OPEN)
    except (IndexError, xapian.Error):
        # Missing arguments, or the index could not be opened for writing.
        print("Usage: extract-sample-db sample_file popcon_index")
        sys.exit(1)
    enquire = xapian.Enquire(popcon)
    print(sample_file.split("/"))
    new_popcon = xapian.WritableDatabase(sys.argv[2]+"-"+sample_file.split("/")[-1],
                                         xapian.DB_CREATE_OR_OVERWRITE)
    print("Popcon repository size: %d" % popcon.get_doccount())
    with open(sample_file) as submissions:
        for submission in submissions:
            submission_id = submission.strip()
            if not submission_id:
                # A blank line would otherwise query the bare "ID" prefix.
                continue
            term = "ID"+submission_id
            print(term)
            enquire.set_query(xapian.Query(term))
            # At most 20 matches are moved per submission ID.
            mset = enquire.get_mset(0, 20)
            for m in mset:
                print("Adding doc %s" % m.docid)
                new_popcon.add_document(popcon.get_document(m.docid))
                print("Removing doc %s" % m.docid)
                popcon.delete_document(m.docid)
    print("Popcon repository size: %d" % popcon.get_doccount())
    print("Popcon repository size: %d" % new_popcon.get_doccount())
... ...
src/experiments/hybrid.py 0 → 100755
... ... @@ -0,0 +1,197 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + hybrid-suite
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import sys
  23 +sys.path.insert(0,'../')
  24 +from config import Config
  25 +from data import PopconXapianIndex, PopconSubmission
  26 +from recommender import Recommender
  27 +from user import LocalSystem, User
  28 +from evaluation import *
  29 +import logging
  30 +import random
  31 +import Gnuplot
  32 +import numpy
  33 +
if __name__ == '__main__':
    # Usage: hybrid strategy sample_file
    #
    # For every popcon submission listed in sample_file, and for every
    # (neighborhood size, profile size) pair, a random 90% of the user profile
    # is held out `iterations` times and the recommendation computed from the
    # remaining 10% is scored against it: precision over the top 20 items and
    # F0.5 over the top 100. Per-setup logs, summaries and plots are written
    # under results/hybrid/<sample file name>/.

    # os is needed for path handling and is not imported at module level.
    import os

    # Both positional arguments are read below, so require both.
    if len(sys.argv) < 3:
        print("Usage: hybrid strategy sample_file")
        sys.exit(1)

    def mean(values):
        # Arithmetic mean as a float; 0.0 for an empty sequence so that a
        # setup without any usable recommendation cannot abort the report.
        if not values:
            return 0.0
        return float(sum(values))/len(values)

    iterations = 20
    profile_size = [10,40,70,100,170,240]
    neighbor_size = [3,10,50,100,200,400]

    #hybrid_strategies = ['knnco','knnco_eset']

    #iterations = 1
    #profile_size = [10,20,30]
    #neighbor_size = [10,20,30]

    cfg = Config()
    population_sample = []
    strategy = sys.argv[1]
    sample_file = sys.argv[2]
    sample_str = sample_file.split('/')[-1]
    with open(sample_file,'r') as f:
        for line in f:
            user_id = line.strip()
            if not user_id:
                continue
            # Submissions are stored under a directory named after the
            # first two characters of their ID.
            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
    if not population_sample:
        # Nothing to evaluate; the summary below needs at least one user.
        print("No submissions listed in %s" % sample_file)
        sys.exit(1)
    sample_dir = ("results/hybrid/%s" % sample_str)
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)

    cfg.strategy = strategy
    p_20_summary = {}
    f05_100_summary = {}
    c_20 = {}
    c_100 = {}

    log_file = os.path.join(sample_dir,sample_str+"-"+cfg.strategy)
    graph_20 = {}
    graph_100 = {}
    graph_20_jpg = {}
    graph_100_jpg = {}
    comment_20 = {}
    comment_100 = {}
    for k in neighbor_size:
        graph_20[k] = log_file+("-neighboorhod%.3d-020.png"%k)
        graph_100[k] = log_file+("-neighboorhod%.3d-100.png"%k)
        # splitext removes exactly the extension; str.strip(".png") would
        # remove any of the characters '.', 'p', 'n', 'g' from both ends.
        graph_20_jpg[k] = os.path.splitext(graph_20[k])[0]+".jpg"
        graph_100_jpg[k] = os.path.splitext(graph_100[k])[0]+".jpg"
        comment_20[k] = graph_20_jpg[k]+".comment"
        comment_100[k] = graph_100_jpg[k]+".comment"

        with open(comment_20[k],'w') as f:
            f.write("# %s\n" % sample_str)
            f.write("# strategy %s\n# threshold 20\n# iterations %d\n\n" %
                    (cfg.strategy,iterations))
            f.write("# neighboorhood\tprofile\tp_20\tc_20\n\n")
        with open(comment_100[k],'w') as f:
            f.write("# %s\n" % sample_str)
            f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
                    (cfg.strategy,iterations))
            f.write("# neighboorhood\tprofile\tf05_100\tc_100\n\n")

        c_20[k] = {}
        c_100[k] = {}
        p_20_summary[k] = {}
        f05_100_summary[k] = {}
        for size in profile_size:
            c_20[k][size] = set()
            c_100[k][size] = set()
            p_20_summary[k][size] = []
            f05_100_summary[k][size] = []
            with open(log_file+"-neighboorhood%.3d-profile%.3d"%(k,size),'w') as f:
                f.write("# %s\n" % sample_str)
                f.write("# strategy %s-neighboorhood%.3d-profile%.3d\n\n" % (cfg.strategy,k,size))
                f.write("# p_20\t\tf05_100\n\n")

    # main loop per user
    for submission_file in population_sample:
        # NOTE(review): PopconSystem is not in this file's explicit imports;
        # presumably it arrives through "from evaluation import *" - confirm.
        user = PopconSystem(submission_file)
        user.filter_pkg_profile(cfg.pkgs_filter)
        user.maximal_pkg_profile()
        for k in neighbor_size:
            cfg.k_neighbors = k
            for size in profile_size:
                cfg.profile_size = size
                rec = Recommender(cfg)
                repo_size = rec.items_repository.get_doccount()
                p_20 = []
                f05_100 = []
                for n in range(iterations):
                    # Fill sample profile
                    profile_len = len(user.pkg_profile)
                    item_score = {}
                    for pkg in user.pkg_profile:
                        item_score[pkg] = user.item_score[pkg]
                    sample = {}
                    sample_size = int(profile_len*0.9)
                    # Move a random 90% of the profile into the held-out sample.
                    for i in range(sample_size):
                        key = random.choice(list(item_score.keys()))
                        sample[key] = item_score.pop(key)
                    iteration_user = User(item_score)
                    recommendation = rec.get_recommendation(iteration_user,repo_size)
                    if hasattr(recommendation,"ranking"):
                        ranking = recommendation.ranking
                        real = RecommendationResult(sample)
                        predicted_20 = RecommendationResult(dict.fromkeys(ranking[:20],1))
                        evaluation = Evaluation(predicted_20,real,repo_size)
                        p_20.append(evaluation.run(Precision()))
                        predicted_100 = RecommendationResult(dict.fromkeys(ranking[:100],1))
                        evaluation = Evaluation(predicted_100,real,repo_size)
                        f05_100.append(evaluation.run(F_score(0.5)))
                        # Accumulate the distinct items recommended, for coverage.
                        c_20[k][size].update(ranking[:20])
                        c_100[k][size].update(ranking[:100])
                # save summary
                # p_20 and f05_100 are appended together, so testing one is
                # enough; a user without any ranking contributes no row
                # instead of raising ZeroDivisionError.
                if p_20:
                    user_p_20 = mean(p_20)
                    user_f05_100 = mean(f05_100)
                    p_20_summary[k][size].append(user_p_20)
                    f05_100_summary[k][size].append(user_f05_100)

                    with open(log_file+"-neighboorhood%.3d-profile%.3d"%(k,size),'a') as f:
                        f.write("%.4f\t\t%.4f\n" % (user_p_20,user_f05_100))

    # back to main flow
    coverage_20 = {}
    coverage_100 = {}
    for k in neighbor_size:
        coverage_20[k] = {}
        coverage_100[k] = {}
        with open(comment_20[k],'a') as f:
            for size in profile_size:
                # repo_size is the value from the last recommender built above.
                coverage_20[k][size] = len(c_20[k][size])/float(repo_size)
                f.write("%3d\t\t%3d\t\t%.4f\t%.4f\n" %
                        (k,size,mean(p_20_summary[k][size]),coverage_20[k][size]))
        with open(comment_100[k],'a') as f:
            for size in profile_size:
                coverage_100[k][size] = len(c_100[k][size])/float(repo_size)
                f.write("%3d\t\t%3d\t\t%.4f\t%.4f\n" %
                        (k,size,mean(f05_100_summary[k][size]),coverage_100[k][size]))

    for k in neighbor_size:
        # plot results summary
        g = Gnuplot.Gnuplot()
        g('set style data lines')
        g('set yrange [0:1.0]')
        g.xlabel('Profile size')
        g.title("Setup: %s-neighboorhood%3d (threshold 20)" % (cfg.strategy,k))
        g.plot(Gnuplot.Data(sorted([[i,mean(p_20_summary[k][i])]
                                    for i in p_20_summary[k].keys()]),title="Precision"),
               Gnuplot.Data(sorted([[i,coverage_20[k][i]]
                                    for i in coverage_20[k].keys()]),title="Coverage"))
        g.hardcopy(graph_20[k],terminal="png")
        #commands.getoutput("convert -quality 100 %s %s" %
        #                   (graph_20[k],graph_20_jpg[k]))
        g = Gnuplot.Gnuplot()
        g('set style data lines')
        g('set yrange [0:1.0]')
        g.xlabel('Profile size')
        g.title("Setup: %s-neighboorhood%3d (threshold 100)" % (cfg.strategy,k))
        g.plot(Gnuplot.Data(sorted([[i,mean(f05_100_summary[k][i])]
                                    for i in f05_100_summary[k].keys()]),title="F05"),
               Gnuplot.Data(sorted([[i,coverage_100[k][i]]
                                    for i in coverage_100[k].keys()]),title="Coverage"))
        g.hardcopy(graph_100[k],terminal="png")
        #commands.getoutput("convert -quality 100 %s %s" %
        #                   (graph_100[k],graph_100_jpg[k]))
... ...
src/experiments/k-suite.py 0 → 100755
... ... @@ -0,0 +1,186 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + k-suite - experiment different neighborhood sizes
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import sys
  23 +sys.path.insert(0,'../')
  24 +from config import Config
  25 +from data import PopconXapianIndex, PopconSubmission
  26 +from recommender import Recommender
  27 +from user import LocalSystem, User
  28 +from evaluation import *
  29 +import logging
  30 +import random
  31 +import Gnuplot
  32 +import numpy
  33 +
def plot_roc(k,roc_points,log_file):
    """Plot the ROC points of one neighborhood size against the diagonal.

    k          -- neighborhood size, used in the title and output file names
    roc_points -- points handed to Gnuplot.Data as given
    log_file   -- path prefix; output goes to <log_file>-kNNN.png and .ps
    """
    chart = Gnuplot.Gnuplot()
    chart('set style data points')
    chart.xlabel('False Positive Rate')
    chart.ylabel('True Positive Rate')
    chart('set xrange [0:1.0]')
    chart('set yrange [0:1.0]')
    setup_name = log_file.split("/")[-1]
    chart.title("Setup: %s-k%d" % (setup_name,k))
    # Reference line from (0,0) to (1,1), drawn before the measured points.
    diagonal = Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7")
    measured = Gnuplot.Data(roc_points)
    chart.plot(diagonal,measured)
    png_path = log_file+("-k%.3d.png"%k)
    chart.hardcopy(png_path,terminal="png")
    ps_path = log_file+("-k%.3d.ps"%k)
    chart.hardcopy(ps_path,terminal="postscript",enhanced=1,color=1)
  46 +
def plot_summary(precision,f05,mcc,log_file):
    """Plot the mean precision, F0.5 and MCC per neighborhood size.

    precision, f05, mcc -- dicts mapping a neighborhood size k to a list of
                           values; each list is averaged into one point
    log_file            -- path prefix; output goes to <log_file>.png and .ps
    """
    def mean_points(metric):
        # One [k, mean] point per neighborhood size, ordered by k: the plot
        # style is "lines", so unordered dict keys would draw a zigzag.
        # Sizes without any value are left out instead of dividing by zero.
        return [[k,sum(metric[k])/len(metric[k])]
                for k in sorted(metric.keys()) if metric[k]]

    g = Gnuplot.Gnuplot()
    g('set style data lines')
    g.xlabel('Neighborhood (k)')
    g.title("Setup: %s-size20" % (log_file.split("/")[-1]))
    g.plot(Gnuplot.Data(mean_points(precision),title="P"),
           Gnuplot.Data(mean_points(f05),title="F05"),
           Gnuplot.Data(mean_points(mcc),title="MCC"))
    g.hardcopy(log_file+(".png"),terminal="png")
    g.hardcopy(log_file+(".ps"),terminal="postscript",enhanced=1,color=1)
  57 +
class ExperimentResults:
    """Accumulates evaluation metrics over repeated runs of one setup."""

    def __init__(self,repo_size):
        self.repository_size = repo_size
        # One list per metric; add_result() appends one value to each.
        self.precision, self.recall, self.fpr = [], [], []
        self.f05, self.mcc = [], []

    def _average(self,values):
        # 0 when nothing was recorded, otherwise the plain sum/len quotient.
        if values:
            return sum(values)/len(values)
        return 0

    def add_result(self,ranking,sample):
        """Evaluate `ranking` against the held-out `sample` and record
        precision, recall, FPR, F0.5 and MCC."""
        evaluation = Evaluation(RecommendationResult(dict.fromkeys(ranking,1)),
                                RecommendationResult(sample),
                                self.repository_size)
        recorders = ((self.precision,lambda: Precision()),
                     (self.recall,lambda: Recall()),
                     (self.fpr,lambda: FPR()),
                     (self.f05,lambda: F_score(0.5)),
                     (self.mcc,lambda: MCC()))
        for scores,make_metric in recorders:
            scores.append(evaluation.run(make_metric()))

    def get_roc_point(self):
        """Return [mean FPR, mean recall], or [0,0] when either is empty."""
        if self.recall and self.fpr:
            return [sum(self.fpr)/len(self.fpr),
                    sum(self.recall)/len(self.recall)]
        return [0,0]

    def get_precision_summary(self):
        """Return the mean recorded precision, or 0 when none was recorded."""
        return self._average(self.precision)

    def get_f05_summary(self):
        """Return the mean recorded F0.5 score, or 0 when none was recorded."""
        return self._average(self.f05)

    def get_mcc_summary(self):
        """Return the mean recorded MCC, or 0 when none was recorded."""
        return self._average(self.mcc)
  95 +
if __name__ == '__main__':
    # Usage: k-suite strategy_str sample_file
    #
    # For every popcon submission listed in sample_file and every
    # neighborhood size, a random 90% of the user profile is held out
    # `iterations` times and a recommendation of `threshold` items computed
    # from the rest is scored against it. Logs and plots are written under
    # results/k-suite/<sample file name>/.

    # os is needed for path handling and is not imported at module level.
    import os

    if len(sys.argv)<3:
        print("Usage: k-suite strategy_str sample_file")
        sys.exit(1)
    threshold = 20
    iterations = 30
    neighbors = [3,5,10,50,100,150,200,300,400,500]
    cfg = Config()
    cfg.strategy = sys.argv[1]
    sample_file = sys.argv[2]
    population_sample = []
    with open(sample_file,'r') as f:
        for line in f:
            user_id = line.strip()
            if not user_id:
                continue
            # Submissions are stored under a directory named after the
            # first two characters of their ID.
            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
    if not population_sample:
        # Nothing to evaluate; the summary below needs at least one user.
        print("No submissions listed in %s" % sample_file)
        sys.exit(1)
    # setup dictionaries and files
    roc_summary = {}
    recommended = {}
    precision_summary = {}
    f05_summary = {}
    mcc_summary = {}
    sample_dir = ("results/k-suite/%s" % sample_file.split('/')[-1])
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)
    log_file = os.path.join(sample_dir,cfg.strategy)
    with open(log_file,'w') as f:
        f.write("# %s\n\n" % sample_file.split('/')[-1])
        f.write("# strategy %s recommendation_size %d iterations %d\n\n" %
                (cfg.strategy,threshold,iterations))
        f.write("# k coverage \tprecision \tf05 \tmcc\n\n")

    for k in neighbors:
        roc_summary[k] = []
        recommended[k] = set()
        precision_summary[k] = []
        f05_summary[k] = []
        mcc_summary[k] = []
        with open(log_file+"-k%.3d"%k,'w') as f:
            f.write("# %s\n\n" % sample_file.split('/')[-1])
            f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))
            f.write("# roc_point \tprecision \tf05 \tmcc\n\n")

    # main loop per user
    for submission_file in population_sample:
        # NOTE(review): PopconSystem is not in this file's explicit imports;
        # presumably it arrives through "from evaluation import *" - confirm.
        user = PopconSystem(submission_file)
        user.filter_pkg_profile(cfg.pkgs_filter)
        user.maximal_pkg_profile()
        for k in neighbors:
            cfg.k_neighbors = k
            rec = Recommender(cfg)
            repo_size = rec.items_repository.get_doccount()
            results = ExperimentResults(repo_size)
            # n iterations for same recommender and user
            for n in range(iterations):
                # Fill sample profile
                profile_len = len(user.pkg_profile)
                item_score = {}
                for pkg in user.pkg_profile:
                    item_score[pkg] = user.item_score[pkg]
                sample = {}
                sample_size = int(profile_len*0.9)
                # Move a random 90% of the profile into the held-out sample.
                for i in range(sample_size):
                    key = random.choice(list(item_score.keys()))
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,threshold)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
                    # Accumulate the distinct items recommended, for coverage.
                    recommended[k].update(recommendation.ranking)
            # save summary
            roc_point = results.get_roc_point()
            roc_summary[k].append(roc_point)
            precision = results.get_precision_summary()
            precision_summary[k].append(precision)
            f05 = results.get_f05_summary()
            f05_summary[k].append(f05)
            mcc = results.get_mcc_summary()
            mcc_summary[k].append(mcc)
            with open(log_file+"-k%.3d"%k,'a') as f:
                f.write("[%.2f,%.2f] \t%.4f \t%.4f \t%.4f\n" %
                        (roc_point[0],roc_point[1],precision,f05,mcc))
    # back to main flow
    plot_summary(precision_summary,f05_summary,mcc_summary,log_file)
    with open(log_file,'a') as f:
        for k in neighbors:
            # Coverage is per neighborhood size, so index by k; repo_size is
            # the value from the last recommender built above.
            coverage = len(recommended[k])/float(repo_size)
            f.write("%3d \t%.2f \t%.4f \t%.4f \t%.4f\n" %
                    (k,coverage,float(sum(precision_summary[k]))/len(precision_summary[k]),
                     float(sum(f05_summary[k]))/len(f05_summary[k]),
                     float(sum(mcc_summary[k]))/len(mcc_summary[k])))
            plot_roc(k,roc_summary[k],log_file)
... ...
src/experiments/legacy/clustering-suite.py 0 → 100755
... ... @@ -0,0 +1,51 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + recommender suite - recommender experiments suite
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import sys
  23 +import os
  24 +sys.path.insert(0,'../')
  25 +from config import Config
  26 +from data import PopconXapianIndex, PopconSubmission
  27 +from recommender import Recommender
  28 +from user import LocalSystem, User
  29 +from evaluation import *
  30 +import logging
  31 +import random
  32 +import Gnuplot
  33 +
if __name__ == '__main__':
    # Rebuild the popcon index in "recluster" mode for the configured number
    # of medoids and maximum popcon size, then log the resulting dispersion
    # to results/clustering/<k_medoids>medoids<max_popcon>max.

    cfg = Config()
    cfg.index_mode = "recluster"
    logging.info("Starting clustering experiments")
    logging.info("Medoids: %d\t Max popcon:%d" % (cfg.k_medoids,cfg.max_popcon))
    cfg.popcon_dir = os.path.expanduser("~/org/popcon.debian.org/popcon-mail/popcon-entries/")
    # Index and clusters directory are suffixed with the setup, so each
    # (k_medoids, max_popcon) pair gets its own paths.
    cfg.popcon_index = cfg.popcon_index+("_%dmedoids%dmax" %
                                         (cfg.k_medoids,cfg.max_popcon))
    cfg.clusters_dir = cfg.clusters_dir+("_%dmedoids%dmax" %
                                         (cfg.k_medoids,cfg.max_popcon))
    pxi = PopconXapianIndex(cfg)
    logging.info("Overall dispersion: %f\n" % pxi.cluster_dispersion)
    # Write clustering log
    if not os.path.exists("results/clustering"):
        os.makedirs("results/clustering")
    with open(("results/clustering/%dmedoids%dmax" % (cfg.k_medoids,cfg.max_popcon)),'w') as output:
        output.write("# k_medoids\tmax_popcon\tdispersion\n")
        # One conversion per value, tab-separated like the header; the former
        # two-conversion format raised TypeError when given three values.
        output.write("%d\t%d\t%f\n" % (cfg.k_medoids,cfg.max_popcon,pxi.cluster_dispersion))
... ...
src/experiments/legacy/experiments.cfg 0 → 100644
... ... @@ -0,0 +1,27 @@
  1 +[DEFAULT]
  2 +repetitions = 1
  3 +iterations = 10
  4 +path = 'results'
  5 +experiment = 'grid'
  6 +weight = ['bm25', 'trad']
  7 +;profile_size = range(10,100,10)
  8 +;sample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
  9 +sample = [0.6, 0.7, 0.8, 0.9]
  10 +
  11 +[content]
  12 +strategy = ['cb','cbt','cbd']
  13 +
  14 +[clustering]
  15 +experiment = 'single'
  16 +;iterations = 4
  17 +;medoids = range(2,6)
  18 +iterations = 6
  19 +medoids = [100,500,1000,5000,10000,50000]
  20 +;disabled for this experiment
  21 +weight = 0
  22 +profile_size = 0
  23 +sample = 0
  24 +
  25 +[colaborative]
  26 +users_repository=["data/popcon","data/popcon-100","data/popcon-500","data/popcon-1000","data/popcon-5000","data/popcon-10000","data/popcon-50000"]
  27 +neighbors = range(10,1010,50)
... ...
src/experiments/legacy/runner.py 0 → 100755
... ... @@ -0,0 +1,171 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + recommender suite - recommender experiments suite
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import expsuite
  23 +import sys
  24 +sys.path.insert(0,'../')
  25 +from config import Config
  26 +from data import PopconXapianIndex, PopconSubmission
  27 +from recommender import Recommender
  28 +from user import LocalSystem, User
  29 +from evaluation import *
  30 +import logging
  31 +import random
  32 +import Gnuplot
  33 +
class ClusteringSuite(expsuite.PyExperimentSuite):
    """Experiment suite that re-clusters the test popcon data for a series
    of medoid counts and reports the cluster dispersion of each run."""

    def reset(self, params, rep):
        """Create a fresh configuration pointing at the test data set."""
        config = Config()
        config.popcon_index = "../tests/test_data/.sample_pxi"
        config.popcon_dir = "../tests/test_data/popcon_dir"
        config.clusters_dir = "../tests/test_data/clusters_dir"
        self.cfg = config

        if params['name'] != "clustering":
            return
        logging.info("Starting 'clustering' experiments suite...")
        self.cfg.index_mode = "recluster"

    def iterate(self, params, rep, n):
        """Build the index with the n-th medoid count from params['medoids'];
        experiments not named "clustering" yield an empty result."""
        if params['name'] != "clustering":
            return {}
        medoids = params['medoids'][n]
        logging.info("Running iteration %d" % medoids)
        self.cfg.k_medoids = medoids
        index = PopconXapianIndex(self.cfg)
        return {'k_medoids': params['medoids'][n],
                'dispersion': index.cluster_dispersion}
  55 +
class ContentBasedSuite(expsuite.PyExperimentSuite):
    """Experiment suite for content-based strategies run against the
    packages of the local system."""

    def reset(self, params, rep):
        """Build the recommender and the local user profile for experiments
        whose name starts with "content"."""
        if params['name'].startswith("content"):
            cfg = Config()
            #if the index was not built yet
            #app_axi = AppAptXapianIndex(cfg.axi,"results/arnaldo/AppAxi")
            cfg.axi = "data/AppAxi"
            cfg.index_mode = "old"
            cfg.weight = params['weight']
            self.rec = Recommender(cfg)
            self.rec.set_strategy(params['strategy'])
            self.repo_size = self.rec.items_repository.get_doccount()
            self.user = LocalSystem()
            self.user.app_pkg_profile(self.rec.items_repository)
            self.user.no_auto_pkg_profile()
            # Number of profile packages held out in every iteration.
            self.sample_size = int(len(self.user.pkg_profile)*params['sample'])
            # iteration should be set to 10 in config file
            #self.profile_size = range(10,101,10)

    def iterate(self, params, rep, n):
        """Hold out a random sample of the profile, recommend from the rest,
        write a recall log and a metrics plot, and return the iteration log.

        Experiments whose name does not start with "content" yield an empty
        result, as ClusteringSuite.iterate does for foreign experiments.
        """
        if not params['name'].startswith("content"):
            return {}
        item_score = dict.fromkeys(self.user.pkg_profile,1)
        # Prepare partition
        sample = {}
        for i in range(self.sample_size):
            key = random.choice(list(item_score.keys()))
            sample[key] = item_score.pop(key)
        # Get full recommendation
        user = User(item_score)
        recommendation = self.rec.get_recommendation(user,self.repo_size)
        ranking = recommendation.ranking
        # First position of every ranked package, built once so that each
        # sampled package is looked up without rescanning the ranking.
        position = {}
        for index,pkg in enumerate(ranking):
            position.setdefault(pkg,index)
        notfound = []
        ranks = []
        for pkg in sample.keys():
            if pkg in position:
                ranks.append(position[pkg])
            else:
                notfound.append(pkg)
        # Write recall log
        recall_file = "results/content/recall/%s-%s-%.2f-%d" % \
                      (params['strategy'],params['weight'],params['sample'],n)
        with open(recall_file,'w') as output:
            output.write("# weight=%s\n" % params['weight'])
            output.write("# strategy=%s\n" % params['strategy'])
            output.write("# sample=%f\n" % params['sample'])
            output.write("\n%d %d %d\n" % \
                         (self.repo_size,len(item_score),self.sample_size))
            for r in sorted(ranks):
                output.write(str(r)+"\n")
            if notfound:
                output.write("Out of recommendation:\n")
                for pkg in notfound:
                    output.write(pkg+"\n")
        # Plot metrics summary
        accuracy = []
        precision = []
        recall = []
        f1 = []
        g = Gnuplot.Gnuplot()
        g('set style data lines')
        g.xlabel('Recommendation size')
        # Metrics are sampled at recommendation sizes 1, 101, 201, ...
        for size in range(1,len(ranking)+1,100):
            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repo_size)
            accuracy.append([size,evaluation.run(Accuracy())])
            precision.append([size,evaluation.run(Precision())])
            recall.append([size,evaluation.run(Recall())])
            f1.append([size,evaluation.run(F1())])
        g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
               Gnuplot.Data(precision,title="Precision"),
               Gnuplot.Data(recall,title="Recall"),
               Gnuplot.Data(f1,title="F1"))
        g.hardcopy(recall_file+"-plot.ps", enhanced=1, color=1)
        # Iteration log
        # NOTE(review): index 20 is the 21st sampled point, i.e. a [size, value]
        # pair for recommendation size 2001; it raises IndexError when the
        # ranking holds fewer than 2001 items - confirm the intended index.
        # NOTE(review): the 'recall:' key carries a trailing colon, unlike
        # its siblings; kept as is for log consumers - confirm before renaming.
        result = {'iteration': n,
                  'weight': params['weight'],
                  'strategy': params['strategy'],
                  'accuracy': accuracy[20],
                  'precision': precision[20],
                  'recall:': recall[20],
                  'f1': f1[20]}
        return result
  139 +
  140 +#class CollaborativeSuite(expsuite.PyExperimentSuite):
  141 +# def reset(self, params, rep):
  142 +# if params['name'].startswith("collaborative"):
  143 +#
  144 +# def iterate(self, params, rep, n):
  145 +# if params['name'].startswith("collaborative"):
  146 +# for root, dirs, files in os.walk(self.source_dir):
  147 +# for popcon_file in files:
  148 +# submission = PopconSubmission(os.path.join(root,popcon_file))
  149 +# user = User(submission.packages)
  150 +# user.maximal_pkg_profile()
  151 +# rec.get_recommendation(user)
  152 +# precision = 0
  153 +# result = {'weight': params['weight'],
  154 +# 'strategy': params['strategy'],
  155 +# 'profile_size': self.profile_size[n],
  156 +# 'accuracy': accuracy,
  157 +# 'precision': precision,
  158 +# 'recall:': recall,
  159 +# 'f1': }
  160 +# else:
  161 +# result = {}
  162 +# return result
  163 +
if __name__ == '__main__':
    # Each suite starts when its name is given on the command line, or
    # unconditionally when fewer than three command-line tokens are present.
    if len(sys.argv) < 3 or "clustering" in sys.argv:
        ClusteringSuite().start()
    if len(sys.argv) < 3 or "content" in sys.argv:
        ContentBasedSuite().start()
    #if "collaborative" in sys.argv or len(sys.argv)<3:
        #CollaborativeSuite().start()
... ...
src/experiments/popcon-population.py 0 → 100755
... ... @@ -0,0 +1,74 @@
  1 +#! /usr/bin/env python
  2 +"""
  3 + misc_popcon - misc experiments with popcon data
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import Gnuplot
  23 +import xapian
  24 +import os
  25 +import random
  26 +import sys
  27 +
def get_population_profile(popcon):
    """Count how many submissions have each profile size.

    The profile size of a document is its number of terms starting with
    "XP". Documents with fewer than 10 such terms are reported on stdout.

    popcon -- index object providing postlist(), get_document()

    Returns (population_profile, max_profile): a list of
    (profile size, number of documents) pairs ordered by size, holding only
    sizes that occur, and the largest size seen. An index without documents
    yields ([], 0).
    """
    from collections import Counter

    profiles_size = []
    # The empty term lists every document, so the last docid is included
    # and gaps left by deleted documents cannot cause a failed lookup.
    for posting in popcon.postlist(""):
        user = popcon.get_document(posting.docid)
        pkgs_profile = [t.term for t in user.termlist() if t.term.startswith("XP")]
        if len(pkgs_profile)<10:
            print("-- profile<10: "+str(user.get_data()))
        profiles_size.append(len(pkgs_profile))
    if not profiles_size:
        return [],0
    # A single counting pass replaces one list.count() scan per size.
    population_profile = sorted(Counter(profiles_size).items())
    return population_profile,max(profiles_size)
  41 +
def get_profile_ranges(population_profile,max_profile,popcon_size):
    """Aggregate the population into profile-size ranges.

    The ranges are (0,50], (50,100], ..., (200,250], plus (250,max_profile]
    when max_profile exceeds 250. Profiles of size 0 fall in no range.

    population_profile -- (profile size, number of documents) pairs
    max_profile        -- largest profile size in the population
    popcon_size        -- total number of documents, used for the shares

    Returns (ranges_population, ranges_percentage): two lists of
    (range upper bound, value) pairs holding the document count and the
    fraction of popcon_size in each range (0.0 when popcon_size is 0).
    """
    bounds = list(range(0,251,50))
    # Only extend past 250 when there is something beyond it; appending a
    # max_profile of 250 or less used to repeat an earlier range and count
    # its documents twice.
    if max_profile>bounds[-1]:
        bounds.append(max_profile)
    ranges_population = []
    ranges_percentage = []
    for minimum,maximum in zip(bounds[:-1],bounds[1:]):
        total = sum(x[1] for x in population_profile
                    if x[0]>minimum and x[0]<=maximum)
        ranges_population.append((maximum,total))
        if popcon_size:
            share = total/float(popcon_size)
        else:
            share = 0.0
        ranges_percentage.append((maximum,share))
    return ranges_population,ranges_percentage
  54 +
def plot(data,xlabel,ylabel,output):
    """Plot `data` as points with the given axis labels and save the chart
    as <output>.png and <output>.ps."""
    chart = Gnuplot.Gnuplot()
    chart('set style data points')
    chart.xlabel(xlabel)
    chart.ylabel(ylabel)
    chart.plot(data)
    png_path = output+".png"
    chart.hardcopy(png_path, terminal="png")
    ps_path = output+".ps"
    chart.hardcopy(ps_path, terminal="postscript", enhanced=1, color=1)
  63 +
if __name__ == '__main__':
    # Report the profile-size distribution of the desktop-apps popcon index
    # and plot it under results/misc-popcon/.
    index_path = os.path.expanduser("~/.app-recommender/popcon_desktopapps")
    popcon = xapian.Database(index_path)
    print("Popcon repository size: %d" % popcon.get_doccount())

    profile_population,max_profile = get_population_profile(popcon)
    population_size = popcon.get_doccount()
    ranges_population,ranges_percentage = get_profile_ranges(
        profile_population,max_profile,population_size)
    print("Population per profile range (up to index)")
    print(ranges_population)
    plot(profile_population,"Desktop profile size","Population size",
         "results/misc-popcon/profile_population")
... ...
src/experiments/pure.py 0 → 100755
... ... @@ -0,0 +1,199 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + profile-suite - experiment different profile sizes
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import sys
  23 +sys.path.insert(0,'../')
  24 +from config import Config
  25 +from data import PopconXapianIndex, PopconSubmission
  26 +from recommender import Recommender
  27 +from user import LocalSystem, User
  28 +from evaluation import *
  29 +import logging
  30 +import random
  31 +import Gnuplot
  32 +import numpy
  33 +
def _mean(values):
    """Return the arithmetic mean of a non-empty list of numbers as a float."""
    return float(sum(values))/len(values)


def _convert_to_jpg(png_path,jpg_path,quality):
    """Best-effort conversion of a PNG graph to JPEG with ImageMagick."""
    import subprocess
    try:
        # Argument list instead of a shell string: the paths derive from the
        # command line and must not be interpreted by a shell.
        subprocess.call(["convert","-quality",str(quality),png_path,jpg_path])
    except OSError:
        # 'convert' could not be started; the PNG graph is still available.
        logging.warning("Could not run 'convert' on %s" % png_path)


if __name__ == '__main__':
    # Make explicit the names this script needs but its import block does
    # not list (they could otherwise only arrive through the star import).
    import os
    from user import PopconSystem

    # Both arguments are mandatory: sys.argv[2] is read further down.
    if len(sys.argv)<3:
        print("Usage: profile-suite strategy_category sample_file")
        sys.exit(1)

    iterations = 20
    profile_size = [10,20,40,70,100,140,170,200,240]
    neighbor_size = [3,5,10,50,100,150,200,300,400,500]

    content_strategies = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
    # Other collaborative strategies seen in this suite: 'knn', 'knn_plus'
    collaborative_strategies = ['knn_eset']

    # The category selects the strategies and which parameter is varied.
    strategy_category = sys.argv[1]
    if strategy_category == "content":
        strategies = content_strategies
        sizes = profile_size
        option_str = "profile"
    elif strategy_category == "collaborative":
        strategies = collaborative_strategies
        sizes = neighbor_size
        option_str = "neighborhood"
    else:
        print("Usage: profile-suite strategy_category sample_file")
        sys.exit(1)

    cfg = Config()
    sample_file = sys.argv[2]
    sample_str = sample_file.split('/')[-1]
    # One submission id per line; submissions are stored under a directory
    # named after the first two characters of the id.
    population_sample = []
    with open(sample_file,'r') as f:
        for line in f:
            user_id = line.strip('\n')
            if not user_id:
                continue
            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
    if not population_sample:
        # Nothing to evaluate; the summaries below would have no data.
        print("No submissions listed in %s" % sample_file)
        sys.exit(1)

    sample_dir = ("results/%s/%s" %
                  (strategy_category,sample_str))
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)

    for strategy in strategies:
        cfg.strategy = strategy
        p_20_summary = {}
        f05_100_summary = {}
        c_20 = {}
        c_100 = {}

        log_file = os.path.join(sample_dir,sample_str+"-"+cfg.strategy)
        graph_20 = log_file+"-20.png"
        graph_100 = log_file+"-100.png"
        # splitext drops exactly the extension; str.strip(".png") would strip
        # any of the characters '.', 'p', 'n', 'g' from both ends.
        graph_20_jpg = os.path.splitext(graph_20)[0]+".jpg"
        graph_100_jpg = os.path.splitext(graph_100)[0]+".jpg"
        comment_20 = graph_20_jpg+".comment"
        comment_100 = graph_100_jpg+".comment"

        with open(comment_20,'w') as f:
            f.write("# sample %s\n" % sample_str)
            f.write("# strategy %s\n# threshold 20\n# iterations %d\n\n" %
                    (cfg.strategy,iterations))
            f.write("# %s\tp_20\tc_20\n\n"%option_str)
        with open(comment_100,'w') as f:
            f.write("# sample %s\n" % sample_str)
            f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
                    (cfg.strategy,iterations))
            f.write("# %s\t\tf05_100\t\tc_100\n\n"%option_str)

        # One log per size, holding one line of averages per user.
        size_log = {}
        for size in sizes:
            c_20[size] = set()
            c_100[size] = set()
            p_20_summary[size] = []
            f05_100_summary[size] = []
            size_log[size] = log_file+"-%s%.3d"%(option_str,size)
            with open(size_log[size],'w') as f:
                f.write("# sample %s\n" % sample_str)
                f.write("# strategy %s-%s%.3d\n\n" % (cfg.strategy,option_str,size))
                f.write("# p_20\tf05_100\n\n")

        # main loop per user
        for submission_file in population_sample:
            user = PopconSystem(submission_file)
            user.filter_pkg_profile(cfg.pkgs_filter)
            user.maximal_pkg_profile()
            for size in sizes:
                cfg.profile_size = size
                cfg.k_neighbors = size
                rec = Recommender(cfg)
                repo_size = rec.items_repository.get_doccount()
                p_20 = []
                f05_100 = []
                for n in range(iterations):
                    # Hold out a random 90% of the profile as the expected
                    # answer; the remaining items are the iteration profile.
                    item_score = dict((pkg,user.item_score[pkg])
                                      for pkg in user.pkg_profile)
                    sample_size = int(len(user.pkg_profile)*0.9)
                    held_out = random.sample(list(item_score),sample_size)
                    sample = dict((pkg,item_score.pop(pkg)) for pkg in held_out)
                    iteration_user = User(item_score)
                    recommendation = rec.get_recommendation(iteration_user,repo_size)
                    if hasattr(recommendation,"ranking"):
                        ranking = recommendation.ranking
                        real = RecommendationResult(sample)
                        predicted_20 = RecommendationResult(dict.fromkeys(ranking[:20],1))
                        evaluation = Evaluation(predicted_20,real,repo_size)
                        p_20.append(evaluation.run(Precision()))
                        predicted_100 = RecommendationResult(dict.fromkeys(ranking[:100],1))
                        evaluation = Evaluation(predicted_100,real,repo_size)
                        f05_100.append(evaluation.run(F_score(0.5)))
                        # Coverage: every item that ever made it to the top.
                        c_20[size].update(ranking[:20])
                        c_100[size].update(ranking[:100])
                # save summary; both lists grow together, so an empty p_20
                # means no iteration produced a ranking for this user/size.
                if p_20:
                    mean_p_20 = _mean(p_20)
                    mean_f05_100 = _mean(f05_100)
                    p_20_summary[size].append(mean_p_20)
                    f05_100_summary[size].append(mean_f05_100)
                    with open(size_log[size],'a') as f:
                        f.write("%.4f \t%.4f\n" % (mean_p_20,mean_f05_100))

        # back to main flow
        coverage_20 = {}
        coverage_100 = {}
        with open(comment_20,'a') as f:
            for size in sizes:
                coverage_20[size] = len(c_20[size])/float(repo_size)
                # Sizes without any evaluated user have no average to report.
                if p_20_summary[size]:
                    f.write("%3d\t\t%.4f\t\t%.4f\n" %
                            (size,_mean(p_20_summary[size]),coverage_20[size]))
        with open(comment_100,'a') as f:
            for size in sizes:
                coverage_100[size] = len(c_100[size])/float(repo_size)
                if f05_100_summary[size]:
                    f.write("%3d\t\t%.4f\t\t%.4f\n" %
                            (size,_mean(f05_100_summary[size]),coverage_100[size]))

        precision_points = sorted([k,_mean(p_20_summary[k])]
                                  for k in sizes if p_20_summary[k])
        f05_points = sorted([k,_mean(f05_100_summary[k])]
                            for k in sizes if f05_100_summary[k])
        if not precision_points:
            logging.warning("No ranking produced for strategy %s" % cfg.strategy)
            continue

        # plot results summary
        g = Gnuplot.Gnuplot()
        g('set style data lines')
        g('set yrange [0:1.0]')
        g.xlabel('%s size'%option_str.capitalize())
        g.title("Setup: %s (threshold 20)" % cfg.strategy)
        g.plot(Gnuplot.Data(precision_points,title="Precision"),
               Gnuplot.Data(sorted([k,coverage_20[k]] for k in coverage_20),
                            title="Coverage"))
        g.hardcopy(graph_20,terminal="png")
        # Convert the graph that was just written (the threshold-100 graph
        # does not exist yet at this point).
        _convert_to_jpg(graph_20,graph_20_jpg,20)
        g = Gnuplot.Gnuplot()
        g('set style data lines')
        g('set yrange [0:1.0]')
        g.xlabel('%s size'%option_str.capitalize())
        g.title("Setup: %s (threshold 100)" % cfg.strategy)
        g.plot(Gnuplot.Data(f05_points,title="F05"),
               Gnuplot.Data(sorted([k,coverage_100[k]] for k in coverage_100),
                            title="Coverage"))
        g.hardcopy(graph_100,terminal="png")
        # NOTE(review): quality differs between the two conversions (20 vs
        # 100), kept as found - confirm it is intended.
        _convert_to_jpg(graph_100,graph_100_jpg,100)
... ...
src/experiments/roc-suite.py 0 → 100755
... ... @@ -0,0 +1,231 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + recommender suite - recommender experiments suite
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import sys
  23 +sys.path.insert(0,'../')
  24 +from config import Config
  25 +from data import PopconXapianIndex, PopconSubmission
  26 +from recommender import Recommender
  27 +from user import LocalSystem, User
  28 +from evaluation import *
  29 +import logging
  30 +import random
  31 +import Gnuplot
  32 +import numpy
  33 +
  34 +#iterations = 3
  35 +#sample_proportions = [0.9]
  36 +#weighting = [('bm25',1.2)]
  37 +#collaborative = ['knn_eset']
  38 +#content_based = ['cb']
  39 +#hybrid = ['knnco']
  40 +#profile_size = [50,100]
  41 +#popcon_size = ["1000"]
  42 +#neighbors = [50]
  43 +
  44 +iterations = 30
  45 +sample_proportions = [0.9]
  46 +weighting = [('bm25',1.0)]
  47 +content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
  48 +collaborative = ['knn_eset','knn','knn_plus']
  49 +hybrid = ['knnco','knnco_eset']
  50 +profile_size = range(20,200,40)
  51 +neighbors = range(10,510,50)
  52 +
def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
    """
    Write the recall log of iteration n to the file "<log_file>-<n>".

    The log starts with a header built from label["description"] and
    label["values"], followed by the repository size, the profile size and
    the number of items in sample. If recommendation has a "ranking"
    attribute, the positions (0-based) at which the sample items appear in
    the ranking are written in increasing order, one per line, followed by
    the list of sample items that are absent from the ranking.
    """
    # The with block closes the file even if a write fails.
    with open(("%s-%.2d" % (log_file,n)),'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%.2d\n" % (label["values"],n))
        output.write("\n# repository profile sample\n%d %d %d\n" % \
                     (repo_size,profile_size,len(sample)))
        if not hasattr(recommendation,"ranking"):
            return
        # Index the ranking once (first occurrence of each item) instead of
        # scanning the whole list twice for every sample item.
        position = {}
        for index,pkg in enumerate(recommendation.ranking):
            position.setdefault(pkg,index)
        notfound = []
        ranks = []
        for pkg in sample:
            if pkg in position:
                ranks.append(position[pkg])
            else:
                notfound.append(pkg)
        for r in sorted(ranks):
            output.write(str(r)+"\n")
        if notfound:
            output.write("# out of recommendation:\n")
            for pkg in notfound:
                output.write(pkg+"\n")
  75 +
def plot_roc(roc_points,eauc,c,p,log_file):
    """
    Plot the ROC curve given by roc_points together with the diagonal from
    (0,0) to (1,1), annotated with the values of c, p and eauc, and save it
    as log_file+"-roc.png" and log_file+"-roc.ps".
    """
    setup_name = log_file.split("/")[-1]
    chart = Gnuplot.Gnuplot()
    chart('set style data lines')
    chart.xlabel('False Positive Rate')
    chart.ylabel('True Positive Rate')
    # Both rates are proportions, so both axes span [0,1].
    chart('set xrange [0:1.0]')
    chart('set yrange [0:1.0]')
    chart.title("Setup: %s" % setup_name)
    # Summary figures shown in the lower right corner of the graph.
    chart('set label "C %.2f" at 0.8,0.25' % c)
    chart('set label "P(20) %.2f" at 0.8,0.2' % p)
    chart('set label "AUC %.4f" at 0.8,0.15' % eauc)
    chart.plot(Gnuplot.Data(roc_points,title="ROC"),
               Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"))
    for suffix,options in (("-roc.png",{"terminal":"png"}),
                           ("-roc.ps",{"terminal":"postscript",
                                       "enhanced":1,"color":1})):
        chart.hardcopy(log_file+suffix,**options)
  92 +
def get_label(cfg,sample_proportion):
    """
    Build the label of an experiment setup from cfg.

    The result has a "description" naming the varied parameters and a
    "values" string with the strategy and the parameter values, depending
    on whether cfg.strategy is content-based, collaborative or hybrid. For
    any other strategy a message is printed and an empty dict is returned.
    sample_proportion is not used.
    """
    name = cfg.strategy
    if name in content_based:
        return {"description": "strategy-profile",
                "values": ("%s-profile%.3d" % (name,cfg.profile_size))}
    if name in collaborative:
        return {"description": "strategy-knn",
                "values": ("%s-k%.3d" % (name,cfg.k_neighbors))}
    if name in hybrid:
        return {"description": "strategy-knn-profile",
                "values": ("%s-k%.3d-profile%.3d" %
                           (name,cfg.k_neighbors,cfg.profile_size))}
    print("Unknown strategy")
    return {}
  110 +
class ExperimentResults:
    """
    Accumulate precision, recall and false positive rate of a series of
    recommendations, measured at several recommendation sizes (thresholds).
    """
    def __init__(self,repo_size):
        """
        Create empty result lists for the thresholds 1, 10, 20, ... up to
        (but not including) repo_size.
        """
        self.repository_size = repo_size
        self.precision = {}
        self.recall = {}
        self.fpr = {}
        # Every item recommended so far, whatever its position.
        self.recommended = set()
        for size in [1]+list(range(10,self.repository_size,10)):
            self.precision[size] = []
            self.recall[size] = []
            self.fpr[size] = []

    def add_result(self,ranking,sample):
        """
        Evaluate the first `size` items of ranking against sample for every
        threshold and store the resulting metrics.
        """
        self.recommended.update(ranking)
        # get data only for point
        for size in self.precision:
            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repository_size)
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            self.fpr[size].append(evaluation.run(FPR()))

    # Average ROC by threshold (= size of recommendation)
    def get_roc_points(self):
        """
        Return the sorted list of [average fpr, average recall] points, one
        per threshold. Thresholds without any stored result are left out,
        so the list is empty if add_result was never called.
        """
        points = []
        for size in self.recall:
            tpr = self.recall[size]
            fpr = self.fpr[size]
            if not tpr or not fpr:
                # Nothing to average for this threshold.
                continue
            # float() keeps the average exact whatever the metric type is.
            points.append([sum(fpr)/float(len(fpr)),sum(tpr)/float(len(tpr))])
        return sorted(points)
  143 +
def run_strategy(cfg,user):
    """
    Run the experiment for the strategy currently set in cfg.

    For every weighting scheme and sample proportion, a random part of the
    user profile is repeatedly held out, a recommendation is produced from
    the remaining items and evaluated against the held-out part. Per
    iteration recall logs, a summary log (coverage, precision at 20, AUC and
    extended AUC) and the ROC graph are written under
    results/roc-suite/<first 8 characters of the user id>/.
    """
    # Explicit import: this file's import block does not list os.
    import os
    for weight in weighting:
        cfg.weight = weight[0]
        cfg.bm25_k1 = weight[1]
        rec = Recommender(cfg)
        repo_size = rec.items_repository.get_doccount()
        for proportion in sample_proportions:
            results = ExperimentResults(repo_size)
            label = get_label(cfg,proportion)
            user_dir = ("results/roc-suite/%s" % user.user_id[:8])
            if not os.path.exists(user_dir):
                # makedirs also creates results/roc-suite on the first run.
                os.makedirs(user_dir)
            log_file = os.path.join(user_dir,label["values"])
            evaluated = 0
            for n in range(iterations):
                # Hold out a random part of the profile as expected answer;
                # the remaining items are the iteration profile.
                profile_len = len(user.pkg_profile)
                item_score = dict((pkg,user.item_score[pkg])
                                  for pkg in user.pkg_profile)
                sample_size = int(profile_len*proportion)
                held_out = random.sample(list(item_score),sample_size)
                sample = dict((pkg,item_score.pop(pkg)) for pkg in held_out)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,repo_size)
                write_recall_log(label,n,sample,recommendation,profile_len,repo_size,log_file)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
                    evaluated += 1
            if not evaluated:
                # No ranking at all: there is nothing to average or plot.
                logging.warning("No ranking produced for %s" % label["values"])
                continue
            roc_points = results.get_roc_points()
            x_coord = [p[0] for p in roc_points]
            y_coord = [p[1] for p in roc_points]
            auc = numpy.trapz(y=y_coord, x=x_coord)
            # Extended AUC: add the areas from (0,0) to the first point and
            # from the last point to (1,1).
            eauc = (auc+
                    numpy.trapz(y=[0,roc_points[0][1]],x=[0,roc_points[0][0]])+
                    numpy.trapz(y=[roc_points[-1][1],1],x=[roc_points[-1][0],1]))
            # Reported as p(20), so it is read at threshold 20.
            precision_20 = sum(results.precision[20])/float(len(results.precision[20]))
            coverage = len(results.recommended)/float(repo_size)
            with open(log_file,'w') as f:
                f.write("# %s\n# %s\n\n" %
                        (label["description"],label["values"]))
                f.write("# coverage \tp(20) \tauc \teauc\n\t%.2f \t%.2f \t%.4f \t%.4f\n\n" %
                        (coverage,precision_20,auc,eauc))
            plot_roc(roc_points,eauc,coverage,precision_20,log_file)
  188 +
def run_content(user,cfg):
    """
    Run every content-based strategy with every profile size for user.
    """
    for name in content_based:
        cfg.strategy = name
        for n_items in profile_size:
            cfg.profile_size = n_items
            run_strategy(cfg,user)
  195 +
def run_collaborative(user,cfg):
    """
    Run every collaborative strategy with every neighbourhood size for user.
    """
    for strategy in collaborative:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            run_strategy(cfg,user)
  204 +
def run_hybrid(user,cfg):
    """
    Run every hybrid strategy with every combination of neighbourhood size
    and profile size for user.
    """
    for strategy in hybrid:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            for size in profile_size:
                cfg.profile_size = size
                run_strategy(cfg,user)
  215 +
if __name__ == '__main__':
    # Explicit import: PopconSystem is used below but is not listed in this
    # file's import block.
    from user import PopconSystem

    if len(sys.argv)<2:
        print("Usage: roc-suite popcon_submission_path [content|collaborative|hybrid]")
        sys.exit(1)

    cfg = Config()
    user = PopconSystem(sys.argv[1])
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    # Without any category on the command line, every category is run.
    run_all = len(sys.argv)<3
    if run_all or "content" in sys.argv:
        run_content(user,cfg)
    if run_all or "collaborative" in sys.argv:
        run_collaborative(user,cfg)
    if run_all or "hybrid" in sys.argv:
        run_hybrid(user,cfg)
... ...
src/experiments/sample-popcon-arch.py 0 → 100755
... ... @@ -0,0 +1,44 @@
  1 +#! /usr/bin/env python
  2 +"""
  3 + sample-popcon-arch - extract a sample of a specific arch
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +import sys
  22 +sys.path.insert(0,'../')
  23 +import xapian
  24 +import os
  25 +import random
  26 +import sys
  27 +from user import RandomPopcon
  28 +
if __name__ == '__main__':
    # Only a missing argument or a non-numeric size is a usage error; any
    # other exception (e.g. an interrupt) must not be swallowed here.
    try:
        size = int(sys.argv[1])
        arch = sys.argv[2]
        popcon_dir = sys.argv[3]
        pkgs_filter = sys.argv[4]
    except (IndexError, ValueError):
        print("Usage: sample-popcon-arch size arch popcon_dir pkgs_filter")
        sys.exit(1)

    # One user id per line, for `size` randomly picked submissions.
    sample_file = ("results/misc-popcon/sample-%s-%d" % (arch,size))
    with open(sample_file,'w') as f:
        for n in range(1,size+1):
            user = RandomPopcon(popcon_dir,arch,pkgs_filter)
            f.write(user.user_id+'\n')
            print("sample %d" % n)
... ...
src/experiments/sample-popcon.py 0 → 100755
... ... @@ -0,0 +1,53 @@
  1 +#! /usr/bin/env python
  2 +"""
  3 + sample-popcon - extract a sample from popcon population
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import xapian
  23 +import os
  24 +import random
  25 +import sys
  26 +
def extract_sample(size,popcon,min_profile,max_profile,output):
    """
    Collect the data of the first `size` documents of popcon whose number
    of terms starting with "XP" is greater than min_profile and at most
    max_profile, and write them, one per line, to the file
    "<output>-<min_profile>-<max_profile>".

    Documents are visited by id, from 1 to the document count.
    """
    selected = []
    for doc_id in range(1,popcon.get_doccount()+1):
        document = popcon.get_document(doc_id)
        profile_len = len([entry.term for entry in document.termlist()
                           if entry.term.startswith("XP")])
        print(profile_len)
        if not (min_profile < profile_len <= max_profile):
            continue
        selected.append(document.get_data())
        # Progress: document id and number of submissions picked so far.
        print("%d %d" % (doc_id,len(selected)))
        if len(selected)==size:
            break
    target = "%s-%d-%d"%(output,min_profile,max_profile)
    with open(target,'w') as f:
        f.writelines(entry+'\n' for entry in selected)
  41 +
if __name__ == '__main__':
    popcon = xapian.Database(os.path.expanduser("~/.app-recommender/popcon_desktopapps"))
    print("Popcon repository size: %d" % popcon.get_doccount())
    # Only a missing or non-numeric argument is a usage error; any other
    # exception (e.g. an interrupt) must not be swallowed here.
    try:
        min_profile = int(sys.argv[1])
        max_profile = int(sys.argv[2])
        size = int(sys.argv[3])
    except (IndexError, ValueError):
        print("Usage: sample-popcon min_profile max_profile sample_size")
        sys.exit(1)
    sample_file = "results/misc-popcon/sample"
    extract_sample(size,popcon,min_profile,max_profile,sample_file)
... ...
src/recommender.py
... ... @@ -75,20 +75,20 @@ class Recommender:
75 75 """
76 76 self.cfg = cfg
77 77 # Load xapian indexes
78   - self.axi_programs = xapian.Database(cfg.axi_programs)
  78 + #self.axi_programs = xapian.Database(cfg.axi_programs)
79 79 self.axi_desktopapps = xapian.Database(cfg.axi_desktopapps)
80 80 if cfg.popcon:
81   - self.popcon_programs = xapian.Database(cfg.popcon_programs)
  81 + #self.popcon_programs = xapian.Database(cfg.popcon_programs)
82 82 self.popcon_desktopapps = xapian.Database(cfg.popcon_desktopapps)
83 83 # Load valid programs, desktopapps and tags
84 84 # format: one package or tag name per line
85   - self.valid_programs = []
  85 + #self.valid_programs = []
86 86 self.valid_desktopapps = []
87 87 self.valid_tags = []
88 88 logging.info("Loading recommender filters")
89   - with open(os.path.join(cfg.filters_dir,"programs")) as pkgs:
90   - self.valid_programs = [line.strip() for line in pkgs
91   - if not line.startswith("#")]
  89 + #with open(os.path.join(cfg.filters_dir,"programs")) as pkgs:
  90 + # self.valid_programs = [line.strip() for line in pkgs
  91 + # if not line.startswith("#")]
92 92 with open(os.path.join(cfg.filters_dir,"desktopapps")) as pkgs:
93 93 self.valid_desktopapps = [line.strip() for line in pkgs
94 94 if not line.startswith("#")]
... ... @@ -109,19 +109,21 @@ class Recommender:
109 109 Set the recommendation strategy.
110 110 """
111 111 logging.info("Setting recommender strategy to \'%s\'" % strategy_str)
112   - if self.cfg.pkgs_filter.split("/")[-1] == "desktopapps":
113   - self.items_repository = self.axi_desktopapps
114   - self.valid_pkgs = self.valid_desktopapps
115   - else:
116   - self.items_repository = self.axi_programs
117   - self.valid_pkgs = self.valid_programs
118 112 # Check if collaborative strategies can be instanciated
119   - if ("col" in strategy_str) or ("knn" in strategy_str):
  113 + if "knn" in strategy_str:
120 114 if not self.cfg.popcon:
121 115 logging.info("Cannot perform collaborative strategy")
122 116 return 1
123   - else:
124   - self.users_repository = self.popcon_programs
  117 + #if self.cfg.pkgs_filter.split("/")[-1] == "desktopapps":
  118 + self.items_repository = self.axi_desktopapps
  119 + self.valid_pkgs = self.valid_desktopapps
  120 + if "knn" in strategy_str:
  121 + self.users_repository = self.popcon_desktopapps
  122 + #else:
  123 + # self.items_repository = self.axi_programs
  124 + # self.valid_pkgs = self.valid_programs
  125 + # if "knn" in strategy_str:
  126 + # self.users_repository = self.popcon_programs
125 127 # Set strategy based on strategy_str
126 128 if strategy_str == "cb":
127 129 self.strategy = strategy.ContentBased("mix",self.cfg.profile_size)
... ... @@ -151,8 +153,9 @@ class Recommender:
151 153 self.strategy = strategy.KnnContent(self.cfg.k_neighbors)
152 154 elif strategy_str == "knnco_eset":
153 155 self.strategy = strategy.KnnContentEset(self.cfg.k_neighbors)
154   - elif strategy_str.startswith("demo"):
155   - self.strategy = strategy.Demographic(strategy_str)
  156 + # [FIXME: fix repository instanciation]
  157 + #elif strategy_str.startswith("demo"):
  158 + # self.strategy = strategy.Demographic(strategy_str)
156 159 else:
157 160 logging.info("Strategy not defined.")
158 161 return
... ...
src/strategy.py
... ... @@ -20,6 +20,7 @@ __license__ = &quot;&quot;&quot;
20 20 along with this program. If not, see <http://www.gnu.org/licenses/>.
21 21 """
22 22  
  23 +import os
23 24 import xapian
24 25 from singleton import Singleton
25 26 import recommender
... ...
src/user.py
... ... @@ -111,7 +111,7 @@ class User:
111 111 """
112 112 Define a user of a recommender.
113 113 """
114   - def __init__(self,item_score,user_id=0,demo_profiles_set=0):
  114 + def __init__(self,item_score,user_id=0,arch=0,demo_profiles_set=0):
115 115 """
116 116 Set initial user attributes. pkg_profile gets the whole set of items,
117 117 a random user_id is set if none was provided and the demographic
... ... @@ -119,6 +119,7 @@ class User:
119 119 """
120 120 self.item_score = item_score
121 121 self.pkg_profile = self.items()
  122 + self.arch = arch
122 123  
123 124 if user_id:
124 125 self.user_id = user_id
... ... @@ -272,21 +273,28 @@ class User:
272 273 return self.pkg_profile
273 274  
class RandomPopcon(User):
    def __init__(self,submissions_dir,arch=0,pkgs_filter=0):
        """
        Set initial parameters from a randomly chosen popcon submission.

        Submissions found under submissions_dir are tried in random order
        until one has at least 100 packages in its profile (after applying
        pkgs_filter, if given) and, when arch is given, matches that
        architecture. The user is then initialized with the packages, id and
        architecture of that submission.

        Raises ValueError if no submission fulfils the requirements.
        """
        # Walk the directory once; every candidate is tried at most once, so
        # the search always terminates.
        candidates = [os.path.join(root, submission) for
                      root, dirs, files in os.walk(submissions_dir)
                      for submission in files]
        random.shuffle(candidates)
        chosen = None
        for path in candidates:
            user = PopconSystem(path)
            # Architecture is checked for each candidate on its own, and is
            # not a requirement when no arch was asked for.
            if arch and user.arch != arch:
                continue
            if pkgs_filter:
                user.filter_pkg_profile(pkgs_filter)
            if len(user.pkg_profile) >= 100:
                chosen = path
                break
        if chosen is None:
            raise ValueError("No suitable popcon submission found in %s" %
                             submissions_dir)
        submission = data.PopconSubmission(chosen)
        User.__init__(self,submission.packages,submission.user_id,submission.arch)
290 298  
291 299 class PopconSystem(User):
292 300 def __init__(self,path,user_id=0):
... ... @@ -296,7 +304,7 @@ class PopconSystem(User):
296 304 submission = data.PopconSubmission(path)
297 305 if not user_id:
298 306 user_id = submission.user_id
299   - User.__init__(self,submission.packages,user_id)
  307 + User.__init__(self,submission.packages,user_id,submission.arch)
300 308  
301 309 class PkgsListSystem(User):
302 310 def __init__(self,pkgs_list_or_file,user_id=0):
... ...
src/web/templates/survey.html 100644 → 100755
... ... @@ -36,7 +36,7 @@ button below.
36 36 </div>
37 37  
38 38  
39   -<form action="/save" method="post" enctype="multipart/form-data" name="surveyform">
  39 +<form action="save" method="post" enctype="multipart/form-data" name="surveyform">
40 40  
41 41 <input type="hidden" name="user_id" value=$request.user.user_id>
42 42 <input type="hidden" name="strategy" value=$request.strategy>
... ...