Commit 1aed15a54b9f4f80ee0f25dc45088fb558661765
1 parent: 5af15ae1
Exists in master and in 1 other branch
User profiling methods refactoring.
Showing 1 changed file with 101 additions and 39 deletions
Show diff stats
src/user.py
@@ -29,16 +29,29 @@ import apt | @@ -29,16 +29,29 @@ import apt | ||
29 | from error import Error | 29 | from error import Error |
30 | from singleton import Singleton | 30 | from singleton import Singleton |
31 | import data | 31 | import data |
32 | +import operator | ||
33 | +import math | ||
32 | 34 | ||
33 | class FilterTag(xapian.ExpandDecider): | 35 | class FilterTag(xapian.ExpandDecider): |
34 | """ | 36 | """ |
35 | Extend xapian.ExpandDecider to consider only tag terms. | 37 | Extend xapian.ExpandDecider to consider only tag terms. |
36 | """ | 38 | """ |
39 | + def __init__(self, valid_tags): | ||
40 | + """ | ||
41 | + Set initial parameters. | ||
42 | + """ | ||
43 | + xapian.ExpandDecider.__init__(self) | ||
44 | + self.valid_tags = valid_tags | ||
45 | + | ||
37 | def __call__(self, term): | 46 | def __call__(self, term): |
38 | """ | 47 | """ |
39 | Return true if the term is a tag, else false. | 48 | Return true if the term is a tag, else false. |
40 | """ | 49 | """ |
41 | - return term.startswith("XT") | 50 | + if self.valid_tags: |
51 | + is_valid = term.lstrip("XT") in self.valid_tags | ||
52 | + else: | ||
53 | + is_valid = 1 | ||
54 | + return term.startswith("XT") and is_valid | ||
42 | 55 | ||
43 | class FilterDescription(xapian.ExpandDecider): | 56 | class FilterDescription(xapian.ExpandDecider): |
44 | """ | 57 | """ |
@@ -51,6 +64,26 @@ class FilterDescription(xapian.ExpandDecider): | @@ -51,6 +64,26 @@ class FilterDescription(xapian.ExpandDecider): | ||
51 | """ | 64 | """ |
52 | return term.islower() or term.startswith("Z") | 65 | return term.islower() or term.startswith("Z") |
53 | 66 | ||
67 | +class FilterTag_or_Description(xapian.ExpandDecider): | ||
68 | + """ | ||
69 | + Extend xapian.ExpandDecider to consider only package description terms. | ||
70 | + """ | ||
71 | + def __init__(self, valid_tags): | ||
72 | + """ | ||
73 | + Set initial parameters. | ||
74 | + """ | ||
75 | + xapian.ExpandDecider.__init__(self) | ||
76 | + self.valid_tags = valid_tags | ||
77 | + | ||
78 | + def __call__(self, term): | ||
79 | + """ | ||
80 | + Return true if the term or its stemmed version is part of a package | ||
81 | + description. | ||
82 | + """ | ||
83 | + is_tag = FilterTag(self.valid_tags)(term) | ||
84 | + is_description = FilterDescription()(term) | ||
85 | + return is_tag or is_description | ||
86 | + | ||
54 | class DemographicProfile(Singleton): | 87 | class DemographicProfile(Singleton): |
55 | def __init__(self): | 88 | def __init__(self): |
56 | self.admin = set(["admin", "hardware", "mail", "protocol", | 89 | self.admin = set(["admin", "hardware", "mail", "protocol", |
@@ -109,57 +142,83 @@ class User: | @@ -109,57 +142,83 @@ class User: | ||
109 | """ | 142 | """ |
110 | self.demographic_profile = DemographicProfile()(profiles_set) | 143 | self.demographic_profile = DemographicProfile()(profiles_set) |
111 | 144 | ||
112 | - def content_profile(self,items_repository,content,size): | 145 | + def content_profile(self,items_repository,content,size,valid_tags=0): |
113 | """ | 146 | """ |
114 | Get user profile for a specific type of content: packages tags, | 147 | Get user profile for a specific type of content: packages tags, |
115 | - description or both (full_profile) | 148 | + description or both (mixed and half-half profiles) |
116 | """ | 149 | """ |
117 | if content == "tag": | 150 | if content == "tag": |
118 | - profile = self.tag_profile(items_repository,size) | ||
119 | - if content == "desc": | ||
120 | - profile = self.desc_profile(items_repository,size) | ||
121 | - if content == "full": | ||
122 | - profile = self.full_profile(items_repository,size) | ||
123 | - logging.debug("User profile: %s" % profile) | 151 | + profile = self.tfidf_profile(items_repository,size,FilterTag(valid_tags)) |
152 | + elif content == "desc": | ||
153 | + profile = self.tfidf_profile(items_repository,size,FilterDescription()) | ||
154 | + elif content == "mix": | ||
155 | + profile = self.tfidf_profile(items_repository,size,FilterTag_or_Description(valid_tags)) | ||
156 | + elif content == "half": | ||
157 | + tag_profile = self.tfidf_profile(items_repository,size,FilterTag(valid_tags)) | ||
158 | + desc_profile = self.tfidf_profile(items_repository,size,FilterDescription()) | ||
159 | + profile = tag_profile[:size/2]+desc_profile[:size/2] | ||
160 | + elif content == "tag_eset": | ||
161 | + profile = self.eset_profile(items_repository,size,FilterTag(valid_tags)) | ||
162 | + elif content == "desc_eset": | ||
163 | + profile = self.eset_profile(items_repository,size,FilterDescription()) | ||
164 | + elif content == "mix_eset": | ||
165 | + profile = self.eset_profile(items_repository,size,FilterTag_or_Description(valid_tags)) | ||
166 | + elif content == "half_eset": | ||
167 | + tag_profile = self.eset_profile(items_repository,size,FilterTag(valid_tags)) | ||
168 | + desc_profile = self.eset_profile(items_repository,size,FilterDescription()) | ||
169 | + profile = tag_profile[:size/2]+desc_profile[:size/2] | ||
170 | + else: | ||
171 | + logging.debug("Unknown content type %s." % content) | ||
172 | + raise Error | ||
173 | + logging.debug("User %s profile: %s" % (content, profile)) | ||
124 | return profile | 174 | return profile |
125 | 175 | ||
126 | - def tag_profile(self,items_repository,size): | 176 | + def tfidf_profile(self,items_repository,size,content_filter): |
127 | """ | 177 | """ |
128 | - Return most relevant tags for a list of packages. | 178 | + Return the most relevant tags for the user list of packages based on |
179 | + the sublinear tfidf weight of packages' tags. | ||
129 | """ | 180 | """ |
130 | - enquire = xapian.Enquire(items_repository) | ||
131 | docs = data.axi_search_pkgs(items_repository,self.pkg_profile) | 181 | docs = data.axi_search_pkgs(items_repository,self.pkg_profile) |
132 | - rset_packages = xapian.RSet() | ||
133 | - for docid in docs: | ||
134 | - rset_packages.add_document(docid) | ||
135 | - # statistically good differentiators | ||
136 | - eset_tags = enquire.get_eset(size, rset_packages, FilterTag()) | ||
137 | - profile = [res.term for res in eset_tags] | 182 | + #weights = data.tfidf_plus(items_repository,docs,content_filter) |
183 | + weights = data.tfidf_weighting(items_repository,docs,content_filter) | ||
184 | + # Eliminate duplicated stemmed term | ||
185 | + profile = self._eliminate_duplicated([w[0] for w in weights],size) | ||
138 | return profile | 186 | return profile |
139 | 187 | ||
140 | - def desc_profile(self,items_repository,size): | 188 | + def eset_profile(self,items_repository,size,content_filter): |
141 | """ | 189 | """ |
142 | - Return most relevant keywords for a list of packages based on their | ||
143 | - text descriptions. | 190 | + Return most relevant tags for a list of packages. |
144 | """ | 191 | """ |
192 | + # Store package documents in a relevant set | ||
145 | enquire = xapian.Enquire(items_repository) | 193 | enquire = xapian.Enquire(items_repository) |
146 | docs = data.axi_search_pkgs(items_repository,self.pkg_profile) | 194 | docs = data.axi_search_pkgs(items_repository,self.pkg_profile) |
147 | rset_packages = xapian.RSet() | 195 | rset_packages = xapian.RSet() |
148 | - for docid in docs: | ||
149 | - rset_packages.add_document(docid) | ||
150 | - eset_keywords = enquire.get_eset(size, rset_packages, | ||
151 | - FilterDescription()) | ||
152 | - profile = [res.term for res in eset_keywords] | 196 | + for d in docs: |
197 | + rset_packages.add_document(d.docid) | ||
198 | + # Get expanded query terms (statistically good differentiators) | ||
199 | + eset_tags = enquire.get_eset(size*2,rset_packages, | ||
200 | + xapian.Enquire.INCLUDE_QUERY_TERMS, | ||
201 | + 1,content_filter) | ||
202 | + # Eliminate duplicated stemmed term | ||
203 | + profile = self._eliminate_duplicated([res.term for res in eset_tags],size) | ||
153 | return profile | 204 | return profile |
154 | 205 | ||
155 | - def full_profile(self,items_repository,size): | ||
156 | - """ | ||
157 | - Return most relevant tags and keywords for a list of packages based | ||
158 | - their tags and descriptions. | ||
159 | - """ | ||
160 | - tag_profile = self.tag_profile(items_repository,size)[:size/2] | ||
161 | - desc_profile = self.desc_profile(items_repository,size)[:size/2] | ||
162 | - return tag_profile+desc_profile | 206 | + def _eliminate_duplicated(self,sorted_list,size): |
207 | + profile = sorted_list[:size] | ||
208 | + next_index = size | ||
209 | + duplicate = 1 | ||
210 | + while duplicate: | ||
211 | + duplicate = 0 | ||
212 | + for term in profile[:]: | ||
213 | + if term.startswith("Z"): | ||
214 | + for p in profile[:]: | ||
215 | + if p.startswith(term.lstrip("Z")): | ||
216 | + duplicate = 1 | ||
217 | + profile.remove(p) | ||
218 | + if len(sorted_list)>next_index: | ||
219 | + profile.append(sorted_list[next_index]) | ||
220 | + next_index +=1 | ||
221 | + return profile | ||
163 | 222 | ||
164 | def filter_pkg_profile(self,filter_list_or_file): | 223 | def filter_pkg_profile(self,filter_list_or_file): |
165 | """ | 224 | """ |
@@ -172,7 +231,8 @@ class User: | @@ -172,7 +231,8 @@ class User: | ||
172 | with open(filter_list_or_file) as valid: | 231 | with open(filter_list_or_file) as valid: |
173 | valid_pkgs = [line.strip() for line in valid] | 232 | valid_pkgs = [line.strip() for line in valid] |
174 | except IOError: | 233 | except IOError: |
175 | - logging.critical("Could not open profile filter file.") | 234 | + logging.critical("Could not open profile filter file: %" % |
235 | + filter_list_or_file) | ||
176 | raise Error | 236 | raise Error |
177 | else: | 237 | else: |
178 | logging.debug("No filter provided for user profiling.") | 238 | logging.debug("No filter provided for user profiling.") |
@@ -229,15 +289,17 @@ class RandomPopcon(User): | @@ -229,15 +289,17 @@ class RandomPopcon(User): | ||
229 | User.__init__(self,submission.packages,submission.user_id) | 289 | User.__init__(self,submission.packages,submission.user_id) |
230 | 290 | ||
231 | class PopconSystem(User): | 291 | class PopconSystem(User): |
232 | - def __init__(self,path): | 292 | + def __init__(self,path,user_id=0): |
233 | """ | 293 | """ |
234 | Set initial parameters. | 294 | Set initial parameters. |
235 | """ | 295 | """ |
236 | submission = data.PopconSubmission(path) | 296 | submission = data.PopconSubmission(path) |
237 | - User.__init__(self,submission.packages,submission.user_id) | 297 | + if not user_id: |
298 | + user_id = submission.user_id | ||
299 | + User.__init__(self,submission.packages,user_id) | ||
238 | 300 | ||
239 | class PkgsListSystem(User): | 301 | class PkgsListSystem(User): |
240 | - def __init__(self,pkgs_list_or_file): | 302 | + def __init__(self,pkgs_list_or_file,user_id=0): |
241 | """ | 303 | """ |
242 | Set initial parameters. | 304 | Set initial parameters. |
243 | """ | 305 | """ |
@@ -254,7 +316,7 @@ class PkgsListSystem(User): | @@ -254,7 +316,7 @@ class PkgsListSystem(User): | ||
254 | logging.debug("No packages provided for user profiling.") | 316 | logging.debug("No packages provided for user profiling.") |
255 | return self.pkg_profile | 317 | return self.pkg_profile |
256 | 318 | ||
257 | - User.__init__(self,dict.fromkeys(pkgs_list,1)) | 319 | + User.__init__(self,dict.fromkeys(pkgs_list,1),user_id) |
258 | 320 | ||
259 | class LocalSystem(User): | 321 | class LocalSystem(User): |
260 | """ | 322 | """ |