Commit 1aed15a54b9f4f80ee0f25dc45088fb558661765
1 parent
5af15ae1
Exists in master and in 1 other branch
User profiling methods refactoring.
Showing 1 changed file with 101 additions and 39 deletions
Show diff stats
src/user.py
| ... | ... | @@ -29,16 +29,29 @@ import apt |
| 29 | 29 | from error import Error |
| 30 | 30 | from singleton import Singleton |
| 31 | 31 | import data |
| 32 | +import operator | |
| 33 | +import math | |
| 32 | 34 | |
| 33 | 35 | class FilterTag(xapian.ExpandDecider): |
| 34 | 36 | """ |
| 35 | 37 | Extend xapian.ExpandDecider to consider only tag terms. |
| 36 | 38 | """ |
| 39 | + def __init__(self, valid_tags): | |
| 40 | + """ | |
| 41 | + Set initial parameters. | |
| 42 | + """ | |
| 43 | + xapian.ExpandDecider.__init__(self) | |
| 44 | + self.valid_tags = valid_tags | |
| 45 | + | |
| 37 | 46 | def __call__(self, term): |
| 38 | 47 | """ |
| 39 | 48 | Return true if the term is a tag, else false. |
| 40 | 49 | """ |
| 41 | - return term.startswith("XT") | |
| 50 | + if self.valid_tags: | |
| 51 | + is_valid = term.lstrip("XT") in self.valid_tags | |
| 52 | + else: | |
| 53 | + is_valid = 1 | |
| 54 | + return term.startswith("XT") and is_valid | |
| 42 | 55 | |
| 43 | 56 | class FilterDescription(xapian.ExpandDecider): |
| 44 | 57 | """ |
| ... | ... | @@ -51,6 +64,26 @@ class FilterDescription(xapian.ExpandDecider): |
| 51 | 64 | """ |
| 52 | 65 | return term.islower() or term.startswith("Z") |
| 53 | 66 | |
| 67 | +class FilterTag_or_Description(xapian.ExpandDecider): | |
| 68 | + """ | |
| 69 | + Extend xapian.ExpandDecider to consider both valid tag terms and package description terms. | |
| 70 | + """ | |
| 71 | + def __init__(self, valid_tags): | |
| 72 | + """ | |
| 73 | + Set initial parameters. | |
| 74 | + """ | |
| 75 | + xapian.ExpandDecider.__init__(self) | |
| 76 | + self.valid_tags = valid_tags | |
| 77 | + | |
| 78 | + def __call__(self, term): | |
| 79 | + """ | |
| 80 | + Return true if the term is a valid tag, or if the term or its stemmed | |
| 81 | + version is part of a package description. | |
| 82 | + """ | |
| 83 | + is_tag = FilterTag(self.valid_tags)(term) | |
| 84 | + is_description = FilterDescription()(term) | |
| 85 | + return is_tag or is_description | |
| 86 | + | |
| 54 | 87 | class DemographicProfile(Singleton): |
| 55 | 88 | def __init__(self): |
| 56 | 89 | self.admin = set(["admin", "hardware", "mail", "protocol", |
| ... | ... | @@ -109,57 +142,83 @@ class User: |
| 109 | 142 | """ |
| 110 | 143 | self.demographic_profile = DemographicProfile()(profiles_set) |
| 111 | 144 | |
| 112 | - def content_profile(self,items_repository,content,size): | |
| 145 | + def content_profile(self,items_repository,content,size,valid_tags=0): | |
| 113 | 146 | """ |
| 114 | 147 | Get user profile for a specific type of content: packages tags, |
| 115 | - description or both (full_profile) | |
| 148 | + description or both (mixed and half-half profiles) | |
| 116 | 149 | """ |
| 117 | 150 | if content == "tag": |
| 118 | - profile = self.tag_profile(items_repository,size) | |
| 119 | - if content == "desc": | |
| 120 | - profile = self.desc_profile(items_repository,size) | |
| 121 | - if content == "full": | |
| 122 | - profile = self.full_profile(items_repository,size) | |
| 123 | - logging.debug("User profile: %s" % profile) | |
| 151 | + profile = self.tfidf_profile(items_repository,size,FilterTag(valid_tags)) | |
| 152 | + elif content == "desc": | |
| 153 | + profile = self.tfidf_profile(items_repository,size,FilterDescription()) | |
| 154 | + elif content == "mix": | |
| 155 | + profile = self.tfidf_profile(items_repository,size,FilterTag_or_Description(valid_tags)) | |
| 156 | + elif content == "half": | |
| 157 | + tag_profile = self.tfidf_profile(items_repository,size,FilterTag(valid_tags)) | |
| 158 | + desc_profile = self.tfidf_profile(items_repository,size,FilterDescription()) | |
| 159 | + profile = tag_profile[:size/2]+desc_profile[:size/2] | |
| 160 | + elif content == "tag_eset": | |
| 161 | + profile = self.eset_profile(items_repository,size,FilterTag(valid_tags)) | |
| 162 | + elif content == "desc_eset": | |
| 163 | + profile = self.eset_profile(items_repository,size,FilterDescription()) | |
| 164 | + elif content == "mix_eset": | |
| 165 | + profile = self.eset_profile(items_repository,size,FilterTag_or_Description(valid_tags)) | |
| 166 | + elif content == "half_eset": | |
| 167 | + tag_profile = self.eset_profile(items_repository,size,FilterTag(valid_tags)) | |
| 168 | + desc_profile = self.eset_profile(items_repository,size,FilterDescription()) | |
| 169 | + profile = tag_profile[:size/2]+desc_profile[:size/2] | |
| 170 | + else: | |
| 171 | + logging.debug("Unknown content type %s." % content) | |
| 172 | + raise Error | |
| 173 | + logging.debug("User %s profile: %s" % (content, profile)) | |
| 124 | 174 | return profile |
| 125 | 175 | |
| 126 | - def tag_profile(self,items_repository,size): | |
| 176 | + def tfidf_profile(self,items_repository,size,content_filter): | |
| 127 | 177 | """ |
| 128 | - Return most relevant tags for a list of packages. | |
| 178 | + Return the most relevant terms (those accepted by content_filter) for the | |
| 179 | + user list of packages, based on the sublinear tfidf weight of those terms. | |
| 129 | 180 | """ |
| 130 | - enquire = xapian.Enquire(items_repository) | |
| 131 | 181 | docs = data.axi_search_pkgs(items_repository,self.pkg_profile) |
| 132 | - rset_packages = xapian.RSet() | |
| 133 | - for docid in docs: | |
| 134 | - rset_packages.add_document(docid) | |
| 135 | - # statistically good differentiators | |
| 136 | - eset_tags = enquire.get_eset(size, rset_packages, FilterTag()) | |
| 137 | - profile = [res.term for res in eset_tags] | |
| 182 | + #weights = data.tfidf_plus(items_repository,docs,content_filter) | |
| 183 | + weights = data.tfidf_weighting(items_repository,docs,content_filter) | |
| 184 | + # Eliminate duplicated stemmed term | |
| 185 | + profile = self._eliminate_duplicated([w[0] for w in weights],size) | |
| 138 | 186 | return profile |
| 139 | 187 | |
| 140 | - def desc_profile(self,items_repository,size): | |
| 188 | + def eset_profile(self,items_repository,size,content_filter): | |
| 141 | 189 | """ |
| 142 | - Return most relevant keywords for a list of packages based on their | |
| 143 | - text descriptions. | |
| 190 | + Return most relevant terms (those accepted by content_filter) for a list of packages. | |
| 144 | 191 | """ |
| 192 | + # Store package documents in a relevant set | |
| 145 | 193 | enquire = xapian.Enquire(items_repository) |
| 146 | 194 | docs = data.axi_search_pkgs(items_repository,self.pkg_profile) |
| 147 | 195 | rset_packages = xapian.RSet() |
| 148 | - for docid in docs: | |
| 149 | - rset_packages.add_document(docid) | |
| 150 | - eset_keywords = enquire.get_eset(size, rset_packages, | |
| 151 | - FilterDescription()) | |
| 152 | - profile = [res.term for res in eset_keywords] | |
| 196 | + for d in docs: | |
| 197 | + rset_packages.add_document(d.docid) | |
| 198 | + # Get expanded query terms (statistically good differentiators) | |
| 199 | + eset_tags = enquire.get_eset(size*2,rset_packages, | |
| 200 | + xapian.Enquire.INCLUDE_QUERY_TERMS, | |
| 201 | + 1,content_filter) | |
| 202 | + # Eliminate duplicated stemmed term | |
| 203 | + profile = self._eliminate_duplicated([res.term for res in eset_tags],size) | |
| 153 | 204 | return profile |
| 154 | 205 | |
| 155 | - def full_profile(self,items_repository,size): | |
| 156 | - """ | |
| 157 | - Return most relevant tags and keywords for a list of packages based | |
| 158 | - their tags and descriptions. | |
| 159 | - """ | |
| 160 | - tag_profile = self.tag_profile(items_repository,size)[:size/2] | |
| 161 | - desc_profile = self.desc_profile(items_repository,size)[:size/2] | |
| 162 | - return tag_profile+desc_profile | |
| 206 | + def _eliminate_duplicated(self,sorted_list,size): | |
| 207 | + profile = sorted_list[:size] | |
| 208 | + next_index = size | |
| 209 | + duplicate = 1 | |
| 210 | + while duplicate: | |
| 211 | + duplicate = 0 | |
| 212 | + for term in profile[:]: | |
| 213 | + if term.startswith("Z"): | |
| 214 | + for p in profile[:]: | |
| 215 | + if p.startswith(term.lstrip("Z")): | |
| 216 | + duplicate = 1 | |
| 217 | + profile.remove(p) | |
| 218 | + if len(sorted_list)>next_index: | |
| 219 | + profile.append(sorted_list[next_index]) | |
| 220 | + next_index +=1 | |
| 221 | + return profile | |
| 163 | 222 | |
| 164 | 223 | def filter_pkg_profile(self,filter_list_or_file): |
| 165 | 224 | """ |
| ... | ... | @@ -172,7 +231,8 @@ class User: |
| 172 | 231 | with open(filter_list_or_file) as valid: |
| 173 | 232 | valid_pkgs = [line.strip() for line in valid] |
| 174 | 233 | except IOError: |
| 175 | - logging.critical("Could not open profile filter file.") | |
| 234 | + logging.critical("Could not open profile filter file: %" % | |
| 235 | + filter_list_or_file) | |
| 176 | 236 | raise Error |
| 177 | 237 | else: |
| 178 | 238 | logging.debug("No filter provided for user profiling.") |
| ... | ... | @@ -229,15 +289,17 @@ class RandomPopcon(User): |
| 229 | 289 | User.__init__(self,submission.packages,submission.user_id) |
| 230 | 290 | |
| 231 | 291 | class PopconSystem(User): |
| 232 | - def __init__(self,path): | |
| 292 | + def __init__(self,path,user_id=0): | |
| 233 | 293 | """ |
| 234 | 294 | Set initial parameters. |
| 235 | 295 | """ |
| 236 | 296 | submission = data.PopconSubmission(path) |
| 237 | - User.__init__(self,submission.packages,submission.user_id) | |
| 297 | + if not user_id: | |
| 298 | + user_id = submission.user_id | |
| 299 | + User.__init__(self,submission.packages,user_id) | |
| 238 | 300 | |
| 239 | 301 | class PkgsListSystem(User): |
| 240 | - def __init__(self,pkgs_list_or_file): | |
| 302 | + def __init__(self,pkgs_list_or_file,user_id=0): | |
| 241 | 303 | """ |
| 242 | 304 | Set initial parameters. |
| 243 | 305 | """ |
| ... | ... | @@ -254,7 +316,7 @@ class PkgsListSystem(User): |
| 254 | 316 | logging.debug("No packages provided for user profiling.") |
| 255 | 317 | return self.pkg_profile |
| 256 | 318 | |
| 257 | - User.__init__(self,dict.fromkeys(pkgs_list,1)) | |
| 319 | + User.__init__(self,dict.fromkeys(pkgs_list,1),user_id) | |
| 258 | 320 | |
| 259 | 321 | class LocalSystem(User): |
| 260 | 322 | """ | ... | ... |