Commit ea86d6ae4b509b088b98111c4d795f77eb046133
1 parent
bc5f760c
Exists in
master
and in
1 other branch
[data]
- axi_search_pkgs() returns docids instead of matches - popcon indexing considers pkgs filters [evaluation] - Added comments to cross-validation - cross_item_score now represents items_score (with respective ratings) [recommender] - Defined some more new strategies [strategies] - Now uses profile_size provided by config [user] - content_profile() replaced profile() - filter_pkg_profile() replaced app_pkg_profile() - new classes RandomPopcon and PopconSystem
Showing
5 changed files
with
128 additions
and
52 deletions
Show diff stats
src/data.py
... | ... | @@ -31,6 +31,7 @@ import shutil |
31 | 31 | from error import Error |
32 | 32 | from singleton import Singleton |
33 | 33 | from dissimilarity import * |
34 | +from config import Config | |
34 | 35 | |
35 | 36 | def axi_search_pkgs(axi,pkgs_list): |
36 | 37 | terms = ["XP"+item for item in pkgs_list] |
... | ... | @@ -38,19 +39,22 @@ def axi_search_pkgs(axi,pkgs_list): |
38 | 39 | enquire = xapian.Enquire(axi) |
39 | 40 | enquire.set_query(query) |
40 | 41 | matches = enquire.get_mset(0,axi.get_doccount()) |
41 | - return matches | |
42 | + return [m.docid for m in matches] | |
42 | 43 | |
43 | 44 | def axi_search_pkg_tags(axi,pkg): |
44 | 45 | enquire = xapian.Enquire(axi) |
45 | 46 | enquire.set_query(xapian.Query("XP"+pkg)) |
46 | 47 | matches = enquire.get_mset(0,1) |
47 | 48 | if not matches: |
48 | - #logging.debug("Package %s not found in items repository" % pkg) | |
49 | - return [] | |
49 | + logging.debug("Package %s not found in items repository" % pkg) | |
50 | + return False | |
50 | 51 | for m in matches: |
51 | 52 | tags = [term.term for term in axi.get_document(m.docid).termlist() if |
52 | 53 | term.term.startswith("XT")] |
53 | - return tags | |
54 | + if not tags: | |
55 | + return "notags" | |
56 | + else: | |
57 | + return tags | |
54 | 58 | |
55 | 59 | def print_index(index): |
56 | 60 | output = "\n---\n" + xapian.Database.__repr__(index) + "\n---\n" |
... | ... | @@ -96,7 +100,7 @@ class SampleAptXapianIndex(xapian.WritableDatabase): |
96 | 100 | xapian.DB_CREATE_OR_OVERWRITE) |
97 | 101 | sample = axi_search_pkgs(axi,pkgs_list) |
98 | 102 | for package in sample: |
99 | - doc_id = self.add_document(axi.get_document(package.docid)) | |
103 | + doc_id = self.add_document(axi.get_document(package)) | |
100 | 104 | |
101 | 105 | def __str__(self): |
102 | 106 | return print_index(self) |
... | ... | @@ -115,6 +119,14 @@ class PopconSubmission(): |
115 | 119 | output += "\n "+pkg+": "+str(weight) |
116 | 120 | return output |
117 | 121 | |
122 | + def apps(self,axi): | |
123 | + apps = {} | |
124 | + for pkg in self.packages.keys(): | |
125 | + tags = axi_search_pkg_tags(self.axi,pkg) | |
126 | + if "XTrole::program" in tags: | |
127 | + apps[pkg] = self.packages[pkg] | |
128 | + return apps | |
129 | + | |
118 | 130 | def load(self,binary=1): |
119 | 131 | """ |
120 | 132 | Parse a popcon submission, generating the names of the valid packages |
... | ... | @@ -159,6 +171,16 @@ class PopconXapianIndex(xapian.WritableDatabase): |
159 | 171 | self.path = os.path.expanduser(cfg.popcon_index) |
160 | 172 | self.source_dir = os.path.expanduser(cfg.popcon_dir) |
161 | 173 | self.max_popcon = cfg.max_popcon |
174 | + self.valid_pkgs = [] | |
175 | + # file format: one pkg_name per line | |
176 | + with open(os.path.join(cfg.filters,cfg.pkgs_filter)) as valid_pkgs: | |
177 | + self.valid_pkgs = [line.strip() for line in valid_pkgs | |
178 | + if not line.startswith("#")] | |
179 | + logging.debug("Considering %d valid packages" % len(self.valid_pkgs)) | |
180 | + with open(os.path.join(cfg.filters,"tags")) as valid_tags: | |
181 | + self.valid_tags = [line.strip() for line in valid_tags | |
182 | + if not line.startswith("#")] | |
183 | + logging.debug("Considering %d valid tags" % len(self.valid_tags)) | |
162 | 184 | if not cfg.index_mode == "old" or not self.load_index(): |
163 | 185 | if not os.path.exists(cfg.popcon_dir): |
164 | 186 | os.makedirs(cfg.popcon_dir) |
... | ... | @@ -243,10 +265,16 @@ class PopconXapianIndex(xapian.WritableDatabase): |
243 | 265 | logging.debug("Parsing popcon submission \'%s\'" % |
244 | 266 | submission.user_id) |
245 | 267 | for pkg, freq in submission.packages.items(): |
246 | - doc.add_term("XP"+pkg,freq) | |
247 | - #if axi_search_pkg_tags(self.axi,pkg): | |
248 | - # for tag in axi_search_pkg_tags(self.axi,pkg): | |
249 | - # doc.add_term(tag,freq) | |
268 | + if pkg in self.valid_pkgs: | |
269 | + tags = axi_search_pkg_tags(self.axi,pkg) | |
270 | + # if the package was found in axi | |
271 | + if tags: | |
272 | + doc.add_term("XP"+pkg,freq) | |
273 | + # if the package has tags associated with it | |
274 | + if not tags == "notags": | |
275 | + for tag in tags: | |
276 | + if tag in self.valid_tags: | |
277 | + doc.add_term(tag,freq) | |
250 | 278 | doc_id = self.add_document(doc) |
251 | 279 | doc_count += 1 |
252 | 280 | logging.debug("Popcon Xapian: Indexing doc %d" % doc_id) |
... | ... | @@ -256,7 +284,7 @@ class PopconXapianIndex(xapian.WritableDatabase): |
256 | 284 | try: |
257 | 285 | self.commit() |
258 | 286 | except: |
259 | - self.flush() # deprecated function, used for old lib version | |
287 | + self.flush() # deprecated function, used for compatibility with old lib version | |
260 | 288 | |
261 | 289 | def get_submissions(self,submissions_dir): |
262 | 290 | """ |
... | ... | @@ -288,9 +316,7 @@ class KMedoidsClustering(cluster.KMeansClustering): |
288 | 316 | data_sample = data |
289 | 317 | else: |
290 | 318 | data_sample = random.sample(data,max_data) |
291 | - print data_sample | |
292 | 319 | cluster.KMeansClustering.__init__(self, data_sample, distance) |
293 | - # cluster.KMeansClustering.__init__(self, data, distance) | |
294 | 320 | self.distanceMatrix = {} |
295 | 321 | for submission in self._KMeansClustering__data: |
296 | 322 | self.distanceMatrix[submission.user_id] = {} | ... | ... |
src/evaluation.py
... | ... | @@ -25,6 +25,7 @@ import random |
25 | 25 | from collections import defaultdict |
26 | 26 | import logging |
27 | 27 | |
28 | +from error import Error | |
28 | 29 | from user import * |
29 | 30 | from recommender import * |
30 | 31 | from singleton import Singleton |
... | ... | @@ -271,11 +272,15 @@ class CrossValidation: |
271 | 272 | """ |
272 | 273 | Perform cross-validation. |
273 | 274 | """ |
274 | - # | |
275 | - cross_item_score = dict.fromkeys(user.pkg_profile,1) | |
275 | + # Extracting user profile scores from cross validation | |
276 | + cross_item_score = {} | |
277 | + for pkg in user.pkg_profile: | |
278 | + cross_item_score[pkg] = user.item_score[pkg] | |
276 | 279 | partition_size = int(len(cross_item_score)*self.partition_proportion) |
280 | + # main iteration | |
277 | 281 | for r in range(self.rounds): |
278 | 282 | round_partition = {} |
283 | + # move items from cross_item_score to round-partition | |
279 | 284 | for j in range(partition_size): |
280 | 285 | if len(cross_item_score)>0: |
281 | 286 | random_key = random.choice(cross_item_score.keys()) |
... | ... | @@ -283,20 +288,25 @@ class CrossValidation: |
283 | 288 | logging.critical("Empty cross_item_score.") |
284 | 289 | raise Error |
285 | 290 | round_partition[random_key] = cross_item_score.pop(random_key) |
286 | - #logging.debug("Round partition: %s",str(round_partition)) | |
287 | - #logging.debug("Cross item-score: %s",str(cross_item_score)) | |
291 | + logging.debug("Round partition: %s",str(round_partition)) | |
292 | + logging.debug("Cross item-score: %s",str(cross_item_score)) | |
293 | + # round user is created with remaining items | |
288 | 294 | round_user = User(cross_item_score) |
289 | 295 | result_size = int(self.recommender.items_repository.get_doccount()* |
290 | 296 | self.result_proportion) |
291 | 297 | predicted_result = self.recommender.get_recommendation(round_user,result_size) |
292 | - #print len(round_partition) | |
298 | + if not predicted_result.size: | |
299 | + logging.critical("No recommendation produced. Abort cross-validation.") | |
300 | + raise Error | |
301 | + # partition is considered the expected result | |
293 | 302 | real_result = RecommendationResult(round_partition) |
294 | - #logging.debug("Predicted result: %s",predicted_result) | |
303 | + logging.debug("Predicted result: %s",predicted_result) | |
295 | 304 | evaluation = Evaluation(predicted_result,real_result, |
296 | 305 | self.recommender.items_repository.get_doccount()) |
297 | 306 | for metric in self.metrics_list: |
298 | 307 | result = evaluation.run(metric) |
299 | 308 | self.cross_results[metric.desc].append(result) |
309 | + # moving back items from round_partition to cross_item_score | |
300 | 310 | while len(round_partition)>0: |
301 | 311 | item,score = round_partition.popitem() |
302 | 312 | cross_item_score[item] = score | ... | ... |
src/recommender.py
... | ... | @@ -78,15 +78,23 @@ class Recommender: |
78 | 78 | """ |
79 | 79 | Set the recommendation strategy. |
80 | 80 | """ |
81 | - if strategy_str == "cb": | |
82 | - self.strategy = strategy.ContentBasedStrategy("full") | |
83 | - if strategy_str == "cbt": | |
84 | - self.strategy = strategy.ContentBasedStrategy("tag") | |
85 | - if strategy_str == "cbd": | |
86 | - self.strategy = strategy.ContentBasedStrategy("desc") | |
87 | - if strategy_str == "col": | |
81 | + self.items_repository = xapian.Database(self.cfg.axi) | |
82 | + if "desktop" in strategy_str: | |
83 | + self.items_repository = xapian.Database("/root/.app-recommender/DesktopAxi") | |
84 | + self.cfg.popcon_index = "/root/.app-recommender/popcon-index_desktop_1000" | |
85 | + | |
86 | + if strategy_str == "cb" or strategy_str == "cb_desktop": | |
87 | + self.strategy = strategy.ContentBasedStrategy("full", | |
88 | + self.cfg.profile_size) | |
89 | + if strategy_str == "cbt" or strategy_str == "cbt_desktop": | |
90 | + self.strategy = strategy.ContentBasedStrategy("tag", | |
91 | + self.cfg.profile_size) | |
92 | + if strategy_str == "cbd" or strategy_str == "cbd_desktop": | |
93 | + self.strategy = strategy.ContentBasedStrategy("desc", | |
94 | + self.cfg.profile_size) | |
95 | + if "col" in strategy_str: | |
88 | 96 | self.users_repository = data.PopconXapianIndex(self.cfg) |
89 | - self.strategy = strategy.CollaborativeStrategy(20) | |
97 | + self.strategy = strategy.CollaborativeStrategy(self.cfg.k_neighbors) | |
90 | 98 | |
91 | 99 | def get_recommendation(self,user,result_size=100): |
92 | 100 | """ | ... | ... |
src/strategy.py
... | ... | @@ -140,7 +140,7 @@ class ContentBasedStrategy(RecommendationStrategy): |
140 | 140 | """ |
141 | 141 | Content-based recommendation strategy based on Apt-xapian-index. |
142 | 142 | """ |
143 | - def __init__(self,content,profile_size=50): | |
143 | + def __init__(self,content,profile_size): | |
144 | 144 | self.description = "Content-based" |
145 | 145 | self.content = content |
146 | 146 | self.profile_size = profile_size |
... | ... | @@ -149,8 +149,8 @@ class ContentBasedStrategy(RecommendationStrategy): |
149 | 149 | """ |
150 | 150 | Perform recommendation strategy. |
151 | 151 | """ |
152 | - profile = user.profile(rec.items_repository,self.content, | |
153 | - self.profile_size) | |
152 | + profile = user.content_profile(rec.items_repository,self.content, | |
153 | + self.profile_size) | |
154 | 154 | # prepare index for querying user profile |
155 | 155 | query = xapian.Query(xapian.Query.OP_OR,profile) |
156 | 156 | enquire = xapian.Enquire(rec.items_repository) |
... | ... | @@ -188,7 +188,8 @@ class CollaborativeStrategy(RecommendationStrategy): |
188 | 188 | """ |
189 | 189 | Perform recommendation strategy. |
190 | 190 | """ |
191 | - profile = ["XP"+package for package in user.pkg_profile] | |
191 | + profile = ["XP"+package for package in | |
192 | + user.filter_pkg_profile("/root/.app-recommender/filters/program")] | |
192 | 193 | # prepare index for querying user profile |
193 | 194 | query = xapian.Query(xapian.Query.OP_OR,profile) |
194 | 195 | enquire = xapian.Enquire(rec.users_repository) |
... | ... | @@ -210,13 +211,15 @@ class CollaborativeStrategy(RecommendationStrategy): |
210 | 211 | eset = enquire.get_eset(recommendation_size,rset,PkgExpandDecider()) |
211 | 212 | # compose result dictionary |
212 | 213 | item_score = {} |
214 | + ranking = [] | |
213 | 215 | for e in eset: |
214 | 216 | package = e.term.lstrip("XP") |
215 | 217 | tags = axi_search_pkg_tags(rec.items_repository,package) |
216 | 218 | #[FIXME] set this constraint somehow |
217 | 219 | #if "XTrole::program" in tags: |
218 | 220 | item_score[package] = e.weight |
219 | - return recommender.RecommendationResult(item_score) | |
221 | + ranking.append(m.document.get_data()) | |
222 | + return recommender.RecommendationResult(item_score, ranking) | |
220 | 223 | |
221 | 224 | class DemographicStrategy(RecommendationStrategy): |
222 | 225 | """ | ... | ... |
src/user.py
... | ... | @@ -19,8 +19,10 @@ __license__ = """ |
19 | 19 | along with this program. If not, see <http://www.gnu.org/licenses/>. |
20 | 20 | """ |
21 | 21 | |
22 | +import os | |
22 | 23 | import random |
23 | 24 | import commands |
25 | +import datetime | |
24 | 26 | import xapian |
25 | 27 | import logging |
26 | 28 | import apt |
... | ... | @@ -43,9 +45,10 @@ class FilterDescription(xapian.ExpandDecider): |
43 | 45 | """ |
44 | 46 | def __call__(self, term): |
45 | 47 | """ |
46 | - Return true if the term is a tag, else false. | |
48 | + Return true if the term or its stemmed version is part of a package | |
49 | + description. | |
47 | 50 | """ |
48 | - return term.islower() #or term.startswith("Z") | |
51 | + return term.islower() or term.startswith("Z") | |
49 | 52 | |
50 | 53 | class DemographicProfile(Singleton): |
51 | 54 | def __init__(self): |
... | ... | @@ -84,7 +87,7 @@ class User: |
84 | 87 | self.pkg_profile = self.items() |
85 | 88 | |
86 | 89 | if user_id: |
87 | - self.id = user_id | |
90 | + self.user_id = user_id | |
88 | 91 | else: |
89 | 92 | random.seed() |
90 | 93 | self.id = random.getrandbits(128) |
... | ... | @@ -105,7 +108,7 @@ class User: |
105 | 108 | """ |
106 | 109 | self.demographic_profile = DemographicProfile()(profiles_set) |
107 | 110 | |
108 | - def profile(self,items_repository,content,size): | |
111 | + def content_profile(self,items_repository,content,size): | |
109 | 112 | """ |
110 | 113 | Get user profile for a specific type of content: packages tags, |
111 | 114 | description or both (full_profile) |
... | ... | @@ -119,10 +122,10 @@ class User: |
119 | 122 | Return most relevant tags for a list of packages. |
120 | 123 | """ |
121 | 124 | enquire = xapian.Enquire(items_repository) |
122 | - matches = data.axi_search_pkgs(items_repository,self.pkg_profile) | |
125 | + docs = data.axi_search_pkgs(items_repository,self.pkg_profile) | |
123 | 126 | rset_packages = xapian.RSet() |
124 | - for m in matches: | |
125 | - rset_packages.add_document(m.docid) | |
127 | + for docid in docs: | |
128 | + rset_packages.add_document(docid) | |
126 | 129 | # statistically good differentiators |
127 | 130 | eset_tags = enquire.get_eset(size, rset_packages, FilterTag()) |
128 | 131 | profile = [res.term for res in eset_tags] |
... | ... | @@ -134,10 +137,10 @@ class User: |
134 | 137 | text descriptions. |
135 | 138 | """ |
136 | 139 | enquire = xapian.Enquire(items_repository) |
137 | - matches = data.axi_search_pkgs(items_repository,self.pkg_profile) | |
140 | + docs = data.axi_search_pkgs(items_repository,self.pkg_profile) | |
138 | 141 | rset_packages = xapian.RSet() |
139 | - for m in matches: | |
140 | - rset_packages.add_document(m.docid) | |
142 | + for docid in docs: | |
143 | + rset_packages.add_document(docid) | |
141 | 144 | eset_keywords = enquire.get_eset(size, rset_packages, |
142 | 145 | FilterDescription()) |
143 | 146 | profile = [res.term for res in eset_keywords] |
... | ... | @@ -152,21 +155,19 @@ class User: |
152 | 155 | desc_profile = self.desc_profile(items_repository,size)[:size/2] |
153 | 156 | return tag_profile+desc_profile |
154 | 157 | |
155 | - def app_pkg_profile(self,axi): | |
158 | + def filter_pkg_profile(self,filter_file): | |
156 | 159 | """ |
157 | - Return list of packages that are applications. | |
160 | + Return list of packages from profile listed in the filter_file. | |
158 | 161 | """ |
159 | 162 | old_profile_size = len(self.pkg_profile) |
160 | - for p in self.pkg_profile[:]: #iterate list copy | |
161 | - tags = data.axi_search_pkg_tags(axi,p) | |
162 | - try: | |
163 | - | |
164 | - if not "XTrole::program" in tags: | |
165 | - self.pkg_profile.remove(p) | |
166 | - except: | |
167 | - logging.debug("Package not found in axi: %s" % p) | |
163 | + with open(filter_file) as valid: | |
164 | + valid_pkgs = [line.strip() for line in valid] | |
165 | + for pkg in self.pkg_profile[:]: #iterate list copy | |
166 | + if pkg not in valid_pkgs: | |
167 | + self.pkg_profile.remove(pkg) | |
168 | + logging.debug("Discarded package %s during profile filtering" % pkg) | |
168 | 169 | profile_size = len(self.pkg_profile) |
169 | - logging.debug("App package profile: reduced packages profile size \ | |
170 | + logging.debug("Filtered package profile: reduced packages profile size \ | |
170 | 171 | from %d to %d." % (old_profile_size, profile_size)) |
171 | 172 | return self.pkg_profile |
172 | 173 | |
... | ... | @@ -193,6 +194,33 @@ class User: |
193 | 194 | from %d to %d." % (old_profile_size, profile_size)) |
194 | 195 | return self.pkg_profile |
195 | 196 | |
197 | +class RandomPopcon(User): | |
198 | + def __init__(self,submissions_dir,pkgs_filter=0): | |
199 | + """ | |
200 | + Set initial parameters. | |
201 | + """ | |
202 | + item_score = {} | |
203 | + len_profile = 0 | |
204 | + while len_profile < 100: | |
205 | + path = random.choice([os.path.join(root, submission) for | |
206 | + root, dirs, files in os.walk(submissions_dir) | |
207 | + for submission in files]) | |
208 | + user = PopconSystem(path) | |
209 | + if pkgs_filter: | |
210 | + user.filter_pkg_profile(pkgs_filter) | |
211 | + len_profile = len(user.pkg_profile) | |
212 | + submission = data.PopconSubmission(path) | |
213 | + User.__init__(self,submission.packages,submission.user_id) | |
214 | + | |
215 | +class PopconSystem(User): | |
216 | + def __init__(self,path): | |
217 | + """ | |
218 | + Set initial parameters. | |
219 | + """ | |
220 | + item_score = {} | |
221 | + submission = data.PopconSubmission(path) | |
222 | + User.__init__(self,submission.packages,submission.user_id) | |
223 | + | |
196 | 224 | class LocalSystem(User): |
197 | 225 | """ |
198 | 226 | Extend the class User to consider the packages installed on the local |
... | ... | @@ -207,6 +235,7 @@ class LocalSystem(User): |
207 | 235 | for line in dpkg_output.splitlines(): |
208 | 236 | pkg = line.split('\t')[0] |
209 | 237 | item_score[pkg] = 1 |
238 | + self.user_id = "local-"+str(datetime.datetime.now()) | |
210 | 239 | User.__init__(self,item_score) |
211 | 240 | |
212 | 241 | def no_auto_pkg_profile(self): | ... | ... |