Commit 1aed15a54b9f4f80ee0f25dc45088fb558661765
1 parent
5af15ae1
Exists in
master
and in
1 other branch
User profiling methods refactoring.
Showing
1 changed file
with
101 additions
and
39 deletions
Show diff stats
src/user.py
... | ... | @@ -29,16 +29,29 @@ import apt |
29 | 29 | from error import Error |
30 | 30 | from singleton import Singleton |
31 | 31 | import data |
32 | +import operator | |
33 | +import math | |
32 | 34 | |
33 | 35 | class FilterTag(xapian.ExpandDecider): |
34 | 36 | """ |
35 | 37 | Extend xapian.ExpandDecider to consider only tag terms. |
36 | 38 | """ |
39 | + def __init__(self, valid_tags): | |
40 | + """ | |
41 | + Set initial parameters. | |
42 | + """ | |
43 | + xapian.ExpandDecider.__init__(self) | |
44 | + self.valid_tags = valid_tags | |
45 | + | |
37 | 46 | def __call__(self, term): |
38 | 47 | """ |
39 | 48 | Return true if the term is a tag, else false. |
40 | 49 | """ |
41 | - return term.startswith("XT") | |
50 | + if self.valid_tags: | |
51 | + is_valid = term.lstrip("XT") in self.valid_tags | |
52 | + else: | |
53 | + is_valid = 1 | |
54 | + return term.startswith("XT") and is_valid | |
42 | 55 | |
43 | 56 | class FilterDescription(xapian.ExpandDecider): |
44 | 57 | """ |
... | ... | @@ -51,6 +64,26 @@ class FilterDescription(xapian.ExpandDecider): |
51 | 64 | """ |
52 | 65 | return term.islower() or term.startswith("Z") |
53 | 66 | |
67 | +class FilterTag_or_Description(xapian.ExpandDecider): | |
68 | + """ | |
69 | + Extend xapian.ExpandDecider to consider both tag terms and package description terms. | |
70 | + """ | |
71 | + def __init__(self, valid_tags): | |
72 | + """ | |
73 | + Set initial parameters. | |
74 | + """ | |
75 | + xapian.ExpandDecider.__init__(self) | |
76 | + self.valid_tags = valid_tags | |
77 | + | |
78 | + def __call__(self, term): | |
79 | + """ | |
80 | + Return true if the term is a tag accepted by FilterTag, or if the term | |
81 | + or its stemmed version is part of a package description. | |
82 | + """ | |
83 | + is_tag = FilterTag(self.valid_tags)(term) | |
84 | + is_description = FilterDescription()(term) | |
85 | + return is_tag or is_description | |
86 | + | |
54 | 87 | class DemographicProfile(Singleton): |
55 | 88 | def __init__(self): |
56 | 89 | self.admin = set(["admin", "hardware", "mail", "protocol", |
... | ... | @@ -109,57 +142,83 @@ class User: |
109 | 142 | """ |
110 | 143 | self.demographic_profile = DemographicProfile()(profiles_set) |
111 | 144 | |
112 | - def content_profile(self,items_repository,content,size): | |
145 | + def content_profile(self,items_repository,content,size,valid_tags=0): | |
113 | 146 | """ |
114 | 147 | Get user profile for a specific type of content: packages tags, |
115 | - description or both (full_profile) | |
148 | + description or both (mixed and half-half profiles) | |
116 | 149 | """ |
117 | 150 | if content == "tag": |
118 | - profile = self.tag_profile(items_repository,size) | |
119 | - if content == "desc": | |
120 | - profile = self.desc_profile(items_repository,size) | |
121 | - if content == "full": | |
122 | - profile = self.full_profile(items_repository,size) | |
123 | - logging.debug("User profile: %s" % profile) | |
151 | + profile = self.tfidf_profile(items_repository,size,FilterTag(valid_tags)) | |
152 | + elif content == "desc": | |
153 | + profile = self.tfidf_profile(items_repository,size,FilterDescription()) | |
154 | + elif content == "mix": | |
155 | + profile = self.tfidf_profile(items_repository,size,FilterTag_or_Description(valid_tags)) | |
156 | + elif content == "half": | |
157 | + tag_profile = self.tfidf_profile(items_repository,size,FilterTag(valid_tags)) | |
158 | + desc_profile = self.tfidf_profile(items_repository,size,FilterDescription()) | |
159 | + profile = tag_profile[:size/2]+desc_profile[:size/2] | |
160 | + elif content == "tag_eset": | |
161 | + profile = self.eset_profile(items_repository,size,FilterTag(valid_tags)) | |
162 | + elif content == "desc_eset": | |
163 | + profile = self.eset_profile(items_repository,size,FilterDescription()) | |
164 | + elif content == "mix_eset": | |
165 | + profile = self.eset_profile(items_repository,size,FilterTag_or_Description(valid_tags)) | |
166 | + elif content == "half_eset": | |
167 | + tag_profile = self.eset_profile(items_repository,size,FilterTag(valid_tags)) | |
168 | + desc_profile = self.eset_profile(items_repository,size,FilterDescription()) | |
169 | + profile = tag_profile[:size/2]+desc_profile[:size/2] | |
170 | + else: | |
171 | + logging.debug("Unknown content type %s." % content) | |
172 | + raise Error | |
173 | + logging.debug("User %s profile: %s" % (content, profile)) | |
124 | 174 | return profile |
125 | 175 | |
126 | - def tag_profile(self,items_repository,size): | |
176 | + def tfidf_profile(self,items_repository,size,content_filter): | |
127 | 177 | """ |
128 | - Return most relevant tags for a list of packages. | |
178 | + Return the most relevant terms (those accepted by content_filter) for the | |
179 | + user's list of packages, based on the sublinear tfidf weight of those terms. | |
129 | 180 | """ |
130 | - enquire = xapian.Enquire(items_repository) | |
131 | 181 | docs = data.axi_search_pkgs(items_repository,self.pkg_profile) |
132 | - rset_packages = xapian.RSet() | |
133 | - for docid in docs: | |
134 | - rset_packages.add_document(docid) | |
135 | - # statistically good differentiators | |
136 | - eset_tags = enquire.get_eset(size, rset_packages, FilterTag()) | |
137 | - profile = [res.term for res in eset_tags] | |
182 | + #weights = data.tfidf_plus(items_repository,docs,content_filter) | |
183 | + weights = data.tfidf_weighting(items_repository,docs,content_filter) | |
184 | + # Eliminate duplicated stemmed term | |
185 | + profile = self._eliminate_duplicated([w[0] for w in weights],size) | |
138 | 186 | return profile |
139 | 187 | |
140 | - def desc_profile(self,items_repository,size): | |
188 | + def eset_profile(self,items_repository,size,content_filter): | |
141 | 189 | """ |
142 | - Return most relevant keywords for a list of packages based on their | |
143 | - text descriptions. | |
190 | + Return the most relevant terms accepted by content_filter for the user's packages, based on Xapian's expand set (eset). | |
144 | 191 | """ |
192 | + # Store package documents in a relevant set | |
145 | 193 | enquire = xapian.Enquire(items_repository) |
146 | 194 | docs = data.axi_search_pkgs(items_repository,self.pkg_profile) |
147 | 195 | rset_packages = xapian.RSet() |
148 | - for docid in docs: | |
149 | - rset_packages.add_document(docid) | |
150 | - eset_keywords = enquire.get_eset(size, rset_packages, | |
151 | - FilterDescription()) | |
152 | - profile = [res.term for res in eset_keywords] | |
196 | + for d in docs: | |
197 | + rset_packages.add_document(d.docid) | |
198 | + # Get expanded query terms (statistically good differentiators) | |
199 | + eset_tags = enquire.get_eset(size*2,rset_packages, | |
200 | + xapian.Enquire.INCLUDE_QUERY_TERMS, | |
201 | + 1,content_filter) | |
202 | + # Eliminate duplicated stemmed term | |
203 | + profile = self._eliminate_duplicated([res.term for res in eset_tags],size) | |
153 | 204 | return profile |
154 | 205 | |
155 | - def full_profile(self,items_repository,size): | |
156 | - """ | |
157 | - Return most relevant tags and keywords for a list of packages based | |
158 | - their tags and descriptions. | |
159 | - """ | |
160 | - tag_profile = self.tag_profile(items_repository,size)[:size/2] | |
161 | - desc_profile = self.desc_profile(items_repository,size)[:size/2] | |
162 | - return tag_profile+desc_profile | |
206 | + def _eliminate_duplicated(self,sorted_list,size): | |
207 | + profile = sorted_list[:size] | |
208 | + next_index = size | |
209 | + duplicate = 1 | |
210 | + while duplicate: | |
211 | + duplicate = 0 | |
212 | + for term in profile[:]: | |
213 | + if term.startswith("Z"): | |
214 | + for p in profile[:]: | |
215 | + if p.startswith(term.lstrip("Z")): | |
216 | + duplicate = 1 | |
217 | + profile.remove(p) | |
218 | + if len(sorted_list)>next_index: | |
219 | + profile.append(sorted_list[next_index]) | |
220 | + next_index +=1 | |
221 | + return profile | |
163 | 222 | |
164 | 223 | def filter_pkg_profile(self,filter_list_or_file): |
165 | 224 | """ |
... | ... | @@ -172,7 +231,8 @@ class User: |
172 | 231 | with open(filter_list_or_file) as valid: |
173 | 232 | valid_pkgs = [line.strip() for line in valid] |
174 | 233 | except IOError: |
175 | - logging.critical("Could not open profile filter file.") | |
234 | + logging.critical("Could not open profile filter file: %s" % | |
235 | + filter_list_or_file) | |
176 | 236 | raise Error |
177 | 237 | else: |
178 | 238 | logging.debug("No filter provided for user profiling.") |
... | ... | @@ -229,15 +289,17 @@ class RandomPopcon(User): |
229 | 289 | User.__init__(self,submission.packages,submission.user_id) |
230 | 290 | |
231 | 291 | class PopconSystem(User): |
232 | - def __init__(self,path): | |
292 | + def __init__(self,path,user_id=0): | |
233 | 293 | """ |
234 | 294 | Set initial parameters. |
235 | 295 | """ |
236 | 296 | submission = data.PopconSubmission(path) |
237 | - User.__init__(self,submission.packages,submission.user_id) | |
297 | + if not user_id: | |
298 | + user_id = submission.user_id | |
299 | + User.__init__(self,submission.packages,user_id) | |
238 | 300 | |
239 | 301 | class PkgsListSystem(User): |
240 | - def __init__(self,pkgs_list_or_file): | |
302 | + def __init__(self,pkgs_list_or_file,user_id=0): | |
241 | 303 | """ |
242 | 304 | Set initial parameters. |
243 | 305 | """ |
... | ... | @@ -254,7 +316,7 @@ class PkgsListSystem(User): |
254 | 316 | logging.debug("No packages provided for user profiling.") |
255 | 317 | return self.pkg_profile |
256 | 318 | |
257 | - User.__init__(self,dict.fromkeys(pkgs_list,1)) | |
319 | + User.__init__(self,dict.fromkeys(pkgs_list,1),user_id) | |
258 | 320 | |
259 | 321 | class LocalSystem(User): |
260 | 322 | """ | ... | ... |