Commit ea86d6ae4b509b088b98111c4d795f77eb046133

Authored by Tássia Camões Araújo
1 parent bc5f760c
Exists in master and in 1 other branch add_vagrant

[data]

- axi_search_pkgs() returns docids instead of matches
- popcon indexing considers pkgs filters

[evaluation]
- Added comments to cross-validation
- cross_item_score now represents items_score (with respective ratings)

[recommender]
- Defined several new strategies

[strategies]
- Now uses profile_size provided by config

[user]
- content_profile() replaced profile()
- filter_pkg_profile() replaced app_pkg_profile()
- new classes RandomPopcon and PopconSystem
src/data.py
... ... @@ -31,6 +31,7 @@ import shutil
31 31 from error import Error
32 32 from singleton import Singleton
33 33 from dissimilarity import *
  34 +from config import Config
34 35  
35 36 def axi_search_pkgs(axi,pkgs_list):
36 37 terms = ["XP"+item for item in pkgs_list]
... ... @@ -38,19 +39,22 @@ def axi_search_pkgs(axi,pkgs_list):
38 39 enquire = xapian.Enquire(axi)
39 40 enquire.set_query(query)
40 41 matches = enquire.get_mset(0,axi.get_doccount())
41   - return matches
  42 + return [m.docid for m in matches]
42 43  
43 44 def axi_search_pkg_tags(axi,pkg):
44 45 enquire = xapian.Enquire(axi)
45 46 enquire.set_query(xapian.Query("XP"+pkg))
46 47 matches = enquire.get_mset(0,1)
47 48 if not matches:
48   - #logging.debug("Package %s not found in items repository" % pkg)
49   - return []
  49 + logging.debug("Package %s not found in items repository" % pkg)
  50 + return False
50 51 for m in matches:
51 52 tags = [term.term for term in axi.get_document(m.docid).termlist() if
52 53 term.term.startswith("XT")]
53   - return tags
  54 + if not tags:
  55 + return "notags"
  56 + else:
  57 + return tags
54 58  
55 59 def print_index(index):
56 60 output = "\n---\n" + xapian.Database.__repr__(index) + "\n---\n"
... ... @@ -96,7 +100,7 @@ class SampleAptXapianIndex(xapian.WritableDatabase):
96 100 xapian.DB_CREATE_OR_OVERWRITE)
97 101 sample = axi_search_pkgs(axi,pkgs_list)
98 102 for package in sample:
99   - doc_id = self.add_document(axi.get_document(package.docid))
  103 + doc_id = self.add_document(axi.get_document(package))
100 104  
101 105 def __str__(self):
102 106 return print_index(self)
... ... @@ -115,6 +119,14 @@ class PopconSubmission():
115 119 output += "\n "+pkg+": "+str(weight)
116 120 return output
117 121  
  122 + def apps(self,axi):
  123 + apps = {}
  124 + for pkg in self.packages.keys():
  125 + tags = axi_search_pkg_tags(self.axi,pkg)
  126 + if "XTrole::program" in tags:
  127 + apps[pkg] = self.packages[pkg]
  128 + return apps
  129 +
118 130 def load(self,binary=1):
119 131 """
120 132 Parse a popcon submission, generating the names of the valid packages
... ... @@ -159,6 +171,16 @@ class PopconXapianIndex(xapian.WritableDatabase):
159 171 self.path = os.path.expanduser(cfg.popcon_index)
160 172 self.source_dir = os.path.expanduser(cfg.popcon_dir)
161 173 self.max_popcon = cfg.max_popcon
  174 + self.valid_pkgs = []
  175 + # file format: one pkg_name per line
  176 + with open(os.path.join(cfg.filters,cfg.pkgs_filter)) as valid_pkgs:
  177 + self.valid_pkgs = [line.strip() for line in valid_pkgs
  178 + if not line.startswith("#")]
  179 + logging.debug("Considering %d valid packages" % len(self.valid_pkgs))
  180 + with open(os.path.join(cfg.filters,"tags")) as valid_tags:
  181 + self.valid_tags = [line.strip() for line in valid_tags
  182 + if not line.startswith("#")]
  183 + logging.debug("Considering %d valid tags" % len(self.valid_tags))
162 184 if not cfg.index_mode == "old" or not self.load_index():
163 185 if not os.path.exists(cfg.popcon_dir):
164 186 os.makedirs(cfg.popcon_dir)
... ... @@ -243,10 +265,16 @@ class PopconXapianIndex(xapian.WritableDatabase):
243 265 logging.debug("Parsing popcon submission \'%s\'" %
244 266 submission.user_id)
245 267 for pkg, freq in submission.packages.items():
246   - doc.add_term("XP"+pkg,freq)
247   - #if axi_search_pkg_tags(self.axi,pkg):
248   - # for tag in axi_search_pkg_tags(self.axi,pkg):
249   - # doc.add_term(tag,freq)
  268 + if pkg in self.valid_pkgs:
  269 + tags = axi_search_pkg_tags(self.axi,pkg)
  270 + # if the package was foung in axi
  271 + if tags:
  272 + doc.add_term("XP"+pkg,freq)
  273 + # if the package has tags associated with it
  274 + if not tags == "notags":
  275 + for tag in tags:
  276 + if tag in self.valid_tags:
  277 + doc.add_term(tag,freq)
250 278 doc_id = self.add_document(doc)
251 279 doc_count += 1
252 280 logging.debug("Popcon Xapian: Indexing doc %d" % doc_id)
... ... @@ -256,7 +284,7 @@ class PopconXapianIndex(xapian.WritableDatabase):
256 284 try:
257 285 self.commit()
258 286 except:
259   - self.flush() # deprecated function, used for old lib version
  287 + self.flush() # deprecated function, used for compatibility with old lib version
260 288  
261 289 def get_submissions(self,submissions_dir):
262 290 """
... ... @@ -288,9 +316,7 @@ class KMedoidsClustering(cluster.KMeansClustering):
288 316 data_sample = data
289 317 else:
290 318 data_sample = random.sample(data,max_data)
291   - print data_sample
292 319 cluster.KMeansClustering.__init__(self, data_sample, distance)
293   - # cluster.KMeansClustering.__init__(self, data, distance)
294 320 self.distanceMatrix = {}
295 321 for submission in self._KMeansClustering__data:
296 322 self.distanceMatrix[submission.user_id] = {}
... ...
src/evaluation.py
... ... @@ -25,6 +25,7 @@ import random
25 25 from collections import defaultdict
26 26 import logging
27 27  
  28 +from error import Error
28 29 from user import *
29 30 from recommender import *
30 31 from singleton import Singleton
... ... @@ -271,11 +272,15 @@ class CrossValidation:
271 272 """
272 273 Perform cross-validation.
273 274 """
274   - #
275   - cross_item_score = dict.fromkeys(user.pkg_profile,1)
  275 + # Extracting user profile scores from cross validation
  276 + cross_item_score = {}
  277 + for pkg in user.pkg_profile:
  278 + cross_item_score[pkg] = user.item_score[pkg]
276 279 partition_size = int(len(cross_item_score)*self.partition_proportion)
  280 + # main iteration
277 281 for r in range(self.rounds):
278 282 round_partition = {}
  283 + # move items from cross_item_score to round-partition
279 284 for j in range(partition_size):
280 285 if len(cross_item_score)>0:
281 286 random_key = random.choice(cross_item_score.keys())
... ... @@ -283,20 +288,25 @@ class CrossValidation:
283 288 logging.critical("Empty cross_item_score.")
284 289 raise Error
285 290 round_partition[random_key] = cross_item_score.pop(random_key)
286   - #logging.debug("Round partition: %s",str(round_partition))
287   - #logging.debug("Cross item-score: %s",str(cross_item_score))
  291 + logging.debug("Round partition: %s",str(round_partition))
  292 + logging.debug("Cross item-score: %s",str(cross_item_score))
  293 + # round user is created with remaining items
288 294 round_user = User(cross_item_score)
289 295 result_size = int(self.recommender.items_repository.get_doccount()*
290 296 self.result_proportion)
291 297 predicted_result = self.recommender.get_recommendation(round_user,result_size)
292   - #print len(round_partition)
  298 + if not predicted_result.size:
  299 + logging.critical("No recommendation produced. Abort cross-validation.")
  300 + raise Error
  301 + # partition is considered the expected result
293 302 real_result = RecommendationResult(round_partition)
294   - #logging.debug("Predicted result: %s",predicted_result)
  303 + logging.debug("Predicted result: %s",predicted_result)
295 304 evaluation = Evaluation(predicted_result,real_result,
296 305 self.recommender.items_repository.get_doccount())
297 306 for metric in self.metrics_list:
298 307 result = evaluation.run(metric)
299 308 self.cross_results[metric.desc].append(result)
  309 + # moving back items from round_partition to cross_item_score
300 310 while len(round_partition)>0:
301 311 item,score = round_partition.popitem()
302 312 cross_item_score[item] = score
... ...
src/recommender.py
... ... @@ -78,15 +78,23 @@ class Recommender:
78 78 """
79 79 Set the recommendation strategy.
80 80 """
81   - if strategy_str == "cb":
82   - self.strategy = strategy.ContentBasedStrategy("full")
83   - if strategy_str == "cbt":
84   - self.strategy = strategy.ContentBasedStrategy("tag")
85   - if strategy_str == "cbd":
86   - self.strategy = strategy.ContentBasedStrategy("desc")
87   - if strategy_str == "col":
  81 + self.items_repository = xapian.Database(self.cfg.axi)
  82 + if "desktop" in strategy_str:
  83 + self.items_repository = xapian.Database("/root/.app-recommender/DesktopAxi")
  84 + self.cfg.popcon_index = "/root/.app-recommender/popcon-index_desktop_1000"
  85 +
  86 + if strategy_str == "cb" or strategy_str == "cb_desktop":
  87 + self.strategy = strategy.ContentBasedStrategy("full",
  88 + self.cfg.profile_size)
  89 + if strategy_str == "cbt" or strategy_str == "cbt_desktop":
  90 + self.strategy = strategy.ContentBasedStrategy("tag",
  91 + self.cfg.profile_size)
  92 + if strategy_str == "cbd" or strategy_str == "cbd_desktop":
  93 + self.strategy = strategy.ContentBasedStrategy("desc",
  94 + self.cfg.profile_size)
  95 + if "col" in strategy_str:
88 96 self.users_repository = data.PopconXapianIndex(self.cfg)
89   - self.strategy = strategy.CollaborativeStrategy(20)
  97 + self.strategy = strategy.CollaborativeStrategy(self.cfg.k_neighbors)
90 98  
91 99 def get_recommendation(self,user,result_size=100):
92 100 """
... ...
src/strategy.py
... ... @@ -140,7 +140,7 @@ class ContentBasedStrategy(RecommendationStrategy):
140 140 """
141 141 Content-based recommendation strategy based on Apt-xapian-index.
142 142 """
143   - def __init__(self,content,profile_size=50):
  143 + def __init__(self,content,profile_size):
144 144 self.description = "Content-based"
145 145 self.content = content
146 146 self.profile_size = profile_size
... ... @@ -149,8 +149,8 @@ class ContentBasedStrategy(RecommendationStrategy):
149 149 """
150 150 Perform recommendation strategy.
151 151 """
152   - profile = user.profile(rec.items_repository,self.content,
153   - self.profile_size)
  152 + profile = user.content_profile(rec.items_repository,self.content,
  153 + self.profile_size)
154 154 # prepair index for querying user profile
155 155 query = xapian.Query(xapian.Query.OP_OR,profile)
156 156 enquire = xapian.Enquire(rec.items_repository)
... ... @@ -188,7 +188,8 @@ class CollaborativeStrategy(RecommendationStrategy):
188 188 """
189 189 Perform recommendation strategy.
190 190 """
191   - profile = ["XP"+package for package in user.pkg_profile]
  191 + profile = ["XP"+package for package in
  192 + user.filter_pkg_profile("/root/.app-recommender/filters/program")]
192 193 # prepair index for querying user profile
193 194 query = xapian.Query(xapian.Query.OP_OR,profile)
194 195 enquire = xapian.Enquire(rec.users_repository)
... ... @@ -210,13 +211,15 @@ class CollaborativeStrategy(RecommendationStrategy):
210 211 eset = enquire.get_eset(recommendation_size,rset,PkgExpandDecider())
211 212 # compose result dictionary
212 213 item_score = {}
  214 + ranking = []
213 215 for e in eset:
214 216 package = e.term.lstrip("XP")
215 217 tags = axi_search_pkg_tags(rec.items_repository,package)
216 218 #[FIXME] set this constraint somehow
217 219 #if "XTrole::program" in tags:
218 220 item_score[package] = e.weight
219   - return recommender.RecommendationResult(item_score)
  221 + ranking.append(m.document.get_data())
  222 + return recommender.RecommendationResult(item_score, ranking)
220 223  
221 224 class DemographicStrategy(RecommendationStrategy):
222 225 """
... ...
src/user.py
... ... @@ -19,8 +19,10 @@ __license__ = """
19 19 along with this program. If not, see <http://www.gnu.org/licenses/>.
20 20 """
21 21  
  22 +import os
22 23 import random
23 24 import commands
  25 +import datetime
24 26 import xapian
25 27 import logging
26 28 import apt
... ... @@ -43,9 +45,10 @@ class FilterDescription(xapian.ExpandDecider):
43 45 """
44 46 def __call__(self, term):
45 47 """
46   - Return true if the term is a tag, else false.
  48 + Return true if the term or its stemmed version is part of a package
  49 + description.
47 50 """
48   - return term.islower() #or term.startswith("Z")
  51 + return term.islower() or term.startswith("Z")
49 52  
50 53 class DemographicProfile(Singleton):
51 54 def __init__(self):
... ... @@ -84,7 +87,7 @@ class User:
84 87 self.pkg_profile = self.items()
85 88  
86 89 if user_id:
87   - self.id = user_id
  90 + self.user_id = user_id
88 91 else:
89 92 random.seed()
90 93 self.id = random.getrandbits(128)
... ... @@ -105,7 +108,7 @@ class User:
105 108 """
106 109 self.demographic_profile = DemographicProfile()(profiles_set)
107 110  
108   - def profile(self,items_repository,content,size):
  111 + def content_profile(self,items_repository,content,size):
109 112 """
110 113 Get user profile for a specific type of content: packages tags,
111 114 description or both (full_profile)
... ... @@ -119,10 +122,10 @@ class User:
119 122 Return most relevant tags for a list of packages.
120 123 """
121 124 enquire = xapian.Enquire(items_repository)
122   - matches = data.axi_search_pkgs(items_repository,self.pkg_profile)
  125 + docs = data.axi_search_pkgs(items_repository,self.pkg_profile)
123 126 rset_packages = xapian.RSet()
124   - for m in matches:
125   - rset_packages.add_document(m.docid)
  127 + for docid in docs:
  128 + rset_packages.add_document(docid)
126 129 # statistically good differentiators
127 130 eset_tags = enquire.get_eset(size, rset_packages, FilterTag())
128 131 profile = [res.term for res in eset_tags]
... ... @@ -134,10 +137,10 @@ class User:
134 137 text descriptions.
135 138 """
136 139 enquire = xapian.Enquire(items_repository)
137   - matches = data.axi_search_pkgs(items_repository,self.pkg_profile)
  140 + docs = data.axi_search_pkgs(items_repository,self.pkg_profile)
138 141 rset_packages = xapian.RSet()
139   - for m in matches:
140   - rset_packages.add_document(m.docid)
  142 + for docid in docs:
  143 + rset_packages.add_document(docid)
141 144 eset_keywords = enquire.get_eset(size, rset_packages,
142 145 FilterDescription())
143 146 profile = [res.term for res in eset_keywords]
... ... @@ -152,21 +155,19 @@ class User:
152 155 desc_profile = self.desc_profile(items_repository,size)[:size/2]
153 156 return tag_profile+desc_profile
154 157  
155   - def app_pkg_profile(self,axi):
  158 + def filter_pkg_profile(self,filter_file):
156 159 """
157   - Return list of packages that are applications.
  160 + Return list of packages from profile listed in the filter_file.
158 161 """
159 162 old_profile_size = len(self.pkg_profile)
160   - for p in self.pkg_profile[:]: #iterate list copy
161   - tags = data.axi_search_pkg_tags(axi,p)
162   - try:
163   -
164   - if not "XTrole::program" in tags:
165   - self.pkg_profile.remove(p)
166   - except:
167   - logging.debug("Package not found in axi: %s" % p)
  163 + with open(filter_file) as valid:
  164 + valid_pkgs = [line.strip() for line in valid]
  165 + for pkg in self.pkg_profile[:]: #iterate list copy
  166 + if pkg not in valid_pkgs:
  167 + self.pkg_profile.remove(pkg)
  168 + logging.debug("Discarded package %s during profile filtering" % pkg)
168 169 profile_size = len(self.pkg_profile)
169   - logging.debug("App package profile: reduced packages profile size \
  170 + logging.debug("Filtered package profile: reduced packages profile size \
170 171 from %d to %d." % (old_profile_size, profile_size))
171 172 return self.pkg_profile
172 173  
... ... @@ -193,6 +194,33 @@ class User:
193 194 from %d to %d." % (old_profile_size, profile_size))
194 195 return self.pkg_profile
195 196  
  197 +class RandomPopcon(User):
  198 + def __init__(self,submissions_dir,pkgs_filter=0):
  199 + """
  200 + Set initial parameters.
  201 + """
  202 + item_score = {}
  203 + len_profile = 0
  204 + while len_profile < 100:
  205 + path = random.choice([os.path.join(root, submission) for
  206 + root, dirs, files in os.walk(submissions_dir)
  207 + for submission in files])
  208 + user = PopconSystem(path)
  209 + if pkgs_filter:
  210 + user.filter_pkg_profile(pkgs_filter)
  211 + len_profile = len(user.pkg_profile)
  212 + submission = data.PopconSubmission(path)
  213 + User.__init__(self,submission.packages,submission.user_id)
  214 +
  215 +class PopconSystem(User):
  216 + def __init__(self,path):
  217 + """
  218 + Set initial parameters.
  219 + """
  220 + item_score = {}
  221 + submission = data.PopconSubmission(path)
  222 + User.__init__(self,submission.packages,submission.user_id)
  223 +
196 224 class LocalSystem(User):
197 225 """
198 226 Extend the class User to consider the packages installed on the local
... ... @@ -207,6 +235,7 @@ class LocalSystem(User):
207 235 for line in dpkg_output.splitlines():
208 236 pkg = line.split('\t')[0]
209 237 item_score[pkg] = 1
  238 + self.user_id = "local-"+str(datetime.datetime.now())
210 239 User.__init__(self,item_score)
211 240  
212 241 def no_auto_pkg_profile(self):
... ...