Commit 1aed15a54b9f4f80ee0f25dc45088fb558661765
1 parent
5af15ae1
Exists in
master
and in
1 other branch
User profiling methods refactoring.
Showing
1 changed file
with
101 additions
and
39 deletions
Show diff stats
src/user.py
| @@ -29,16 +29,29 @@ import apt | @@ -29,16 +29,29 @@ import apt | ||
| 29 | from error import Error | 29 | from error import Error |
| 30 | from singleton import Singleton | 30 | from singleton import Singleton |
| 31 | import data | 31 | import data |
| 32 | +import operator | ||
| 33 | +import math | ||
| 32 | 34 | ||
| 33 | class FilterTag(xapian.ExpandDecider): | 35 | class FilterTag(xapian.ExpandDecider): |
| 34 | """ | 36 | """ |
| 35 | Extend xapian.ExpandDecider to consider only tag terms. | 37 | Extend xapian.ExpandDecider to consider only tag terms. |
| 36 | """ | 38 | """ |
| 39 | + def __init__(self, valid_tags): | ||
| 40 | + """ | ||
| 41 | + Set initial parameters. | ||
| 42 | + """ | ||
| 43 | + xapian.ExpandDecider.__init__(self) | ||
| 44 | + self.valid_tags = valid_tags | ||
| 45 | + | ||
| 37 | def __call__(self, term): | 46 | def __call__(self, term): |
| 38 | """ | 47 | """ |
| 39 | Return true if the term is a tag, else false. | 48 | Return true if the term is a tag, else false. |
| 40 | """ | 49 | """ |
| 41 | - return term.startswith("XT") | 50 | + if self.valid_tags: |
| 51 | + is_valid = term.lstrip("XT") in self.valid_tags | ||
| 52 | + else: | ||
| 53 | + is_valid = 1 | ||
| 54 | + return term.startswith("XT") and is_valid | ||
| 42 | 55 | ||
| 43 | class FilterDescription(xapian.ExpandDecider): | 56 | class FilterDescription(xapian.ExpandDecider): |
| 44 | """ | 57 | """ |
| @@ -51,6 +64,26 @@ class FilterDescription(xapian.ExpandDecider): | @@ -51,6 +64,26 @@ class FilterDescription(xapian.ExpandDecider): | ||
| 51 | """ | 64 | """ |
| 52 | return term.islower() or term.startswith("Z") | 65 | return term.islower() or term.startswith("Z") |
| 53 | 66 | ||
| 67 | +class FilterTag_or_Description(xapian.ExpandDecider): | ||
| 68 | + """ | ||
| 69 | + Extend xapian.ExpandDecider to consider both tag terms and package description terms. | ||
| 70 | + """ | ||
| 71 | + def __init__(self, valid_tags): | ||
| 72 | + """ | ||
| 73 | + Set initial parameters. | ||
| 74 | + """ | ||
| 75 | + xapian.ExpandDecider.__init__(self) | ||
| 76 | + self.valid_tags = valid_tags | ||
| 77 | + | ||
| 78 | + def __call__(self, term): | ||
| 79 | + """ | ||
| 80 | + Return true if the term is accepted by FilterTag (a valid tag) or by | ||
| 81 | + FilterDescription (a package description term or its stemmed version). | ||
| 82 | + """ | ||
| 83 | + is_tag = FilterTag(self.valid_tags)(term) | ||
| 84 | + is_description = FilterDescription()(term) | ||
| 85 | + return is_tag or is_description | ||
| 86 | + | ||
| 54 | class DemographicProfile(Singleton): | 87 | class DemographicProfile(Singleton): |
| 55 | def __init__(self): | 88 | def __init__(self): |
| 56 | self.admin = set(["admin", "hardware", "mail", "protocol", | 89 | self.admin = set(["admin", "hardware", "mail", "protocol", |
| @@ -109,57 +142,83 @@ class User: | @@ -109,57 +142,83 @@ class User: | ||
| 109 | """ | 142 | """ |
| 110 | self.demographic_profile = DemographicProfile()(profiles_set) | 143 | self.demographic_profile = DemographicProfile()(profiles_set) |
| 111 | 144 | ||
| 112 | - def content_profile(self,items_repository,content,size): | 145 | + def content_profile(self,items_repository,content,size,valid_tags=0): |
| 113 | """ | 146 | """ |
| 114 | Get user profile for a specific type of content: packages tags, | 147 | Get user profile for a specific type of content: packages tags, |
| 115 | - description or both (full_profile) | 148 | + description or both (mixed and half-half profiles) |
| 116 | """ | 149 | """ |
| 117 | if content == "tag": | 150 | if content == "tag": |
| 118 | - profile = self.tag_profile(items_repository,size) | ||
| 119 | - if content == "desc": | ||
| 120 | - profile = self.desc_profile(items_repository,size) | ||
| 121 | - if content == "full": | ||
| 122 | - profile = self.full_profile(items_repository,size) | ||
| 123 | - logging.debug("User profile: %s" % profile) | 151 | + profile = self.tfidf_profile(items_repository,size,FilterTag(valid_tags)) |
| 152 | + elif content == "desc": | ||
| 153 | + profile = self.tfidf_profile(items_repository,size,FilterDescription()) | ||
| 154 | + elif content == "mix": | ||
| 155 | + profile = self.tfidf_profile(items_repository,size,FilterTag_or_Description(valid_tags)) | ||
| 156 | + elif content == "half": | ||
| 157 | + tag_profile = self.tfidf_profile(items_repository,size,FilterTag(valid_tags)) | ||
| 158 | + desc_profile = self.tfidf_profile(items_repository,size,FilterDescription()) | ||
| 159 | + profile = tag_profile[:size/2]+desc_profile[:size/2] | ||
| 160 | + elif content == "tag_eset": | ||
| 161 | + profile = self.eset_profile(items_repository,size,FilterTag(valid_tags)) | ||
| 162 | + elif content == "desc_eset": | ||
| 163 | + profile = self.eset_profile(items_repository,size,FilterDescription()) | ||
| 164 | + elif content == "mix_eset": | ||
| 165 | + profile = self.eset_profile(items_repository,size,FilterTag_or_Description(valid_tags)) | ||
| 166 | + elif content == "half_eset": | ||
| 167 | + tag_profile = self.eset_profile(items_repository,size,FilterTag(valid_tags)) | ||
| 168 | + desc_profile = self.eset_profile(items_repository,size,FilterDescription()) | ||
| 169 | + profile = tag_profile[:size/2]+desc_profile[:size/2] | ||
| 170 | + else: | ||
| 171 | + logging.debug("Unknown content type %s." % content) | ||
| 172 | + raise Error | ||
| 173 | + logging.debug("User %s profile: %s" % (content, profile)) | ||
| 124 | return profile | 174 | return profile |
| 125 | 175 | ||
| 126 | - def tag_profile(self,items_repository,size): | 176 | + def tfidf_profile(self,items_repository,size,content_filter): |
| 127 | """ | 177 | """ |
| 128 | - Return most relevant tags for a list of packages. | 178 | + Return the most relevant terms accepted by content_filter for the user's |
| 179 | + list of packages, based on the sublinear tfidf weight of those terms. | ||
| 129 | """ | 180 | """ |
| 130 | - enquire = xapian.Enquire(items_repository) | ||
| 131 | docs = data.axi_search_pkgs(items_repository,self.pkg_profile) | 181 | docs = data.axi_search_pkgs(items_repository,self.pkg_profile) |
| 132 | - rset_packages = xapian.RSet() | ||
| 133 | - for docid in docs: | ||
| 134 | - rset_packages.add_document(docid) | ||
| 135 | - # statistically good differentiators | ||
| 136 | - eset_tags = enquire.get_eset(size, rset_packages, FilterTag()) | ||
| 137 | - profile = [res.term for res in eset_tags] | 182 | + #weights = data.tfidf_plus(items_repository,docs,content_filter) |
| 183 | + weights = data.tfidf_weighting(items_repository,docs,content_filter) | ||
| 184 | + # Eliminate duplicated stemmed terms | ||
| 185 | + profile = self._eliminate_duplicated([w[0] for w in weights],size) | ||
| 138 | return profile | 186 | return profile |
| 139 | 187 | ||
| 140 | - def desc_profile(self,items_repository,size): | 188 | + def eset_profile(self,items_repository,size,content_filter): |
| 141 | """ | 189 | """ |
| 142 | - Return most relevant keywords for a list of packages based on their | ||
| 143 | - text descriptions. | 190 | + Return most relevant terms (as accepted by content_filter) for a list of packages. |
| 144 | """ | 191 | """ |
| 192 | + # Store package documents in a relevant set | ||
| 145 | enquire = xapian.Enquire(items_repository) | 193 | enquire = xapian.Enquire(items_repository) |
| 146 | docs = data.axi_search_pkgs(items_repository,self.pkg_profile) | 194 | docs = data.axi_search_pkgs(items_repository,self.pkg_profile) |
| 147 | rset_packages = xapian.RSet() | 195 | rset_packages = xapian.RSet() |
| 148 | - for docid in docs: | ||
| 149 | - rset_packages.add_document(docid) | ||
| 150 | - eset_keywords = enquire.get_eset(size, rset_packages, | ||
| 151 | - FilterDescription()) | ||
| 152 | - profile = [res.term for res in eset_keywords] | 196 | + for d in docs: |
| 197 | + rset_packages.add_document(d.docid) | ||
| 198 | + # Get expanded query terms (statistically good differentiators) | ||
| 199 | + eset_tags = enquire.get_eset(size*2,rset_packages, | ||
| 200 | + xapian.Enquire.INCLUDE_QUERY_TERMS, | ||
| 201 | + 1,content_filter) | ||
| 202 | + # Eliminate duplicated stemmed terms | ||
| 203 | + profile = self._eliminate_duplicated([res.term for res in eset_tags],size) | ||
| 153 | return profile | 204 | return profile |
| 154 | 205 | ||
| 155 | - def full_profile(self,items_repository,size): | ||
| 156 | - """ | ||
| 157 | - Return most relevant tags and keywords for a list of packages based | ||
| 158 | - their tags and descriptions. | ||
| 159 | - """ | ||
| 160 | - tag_profile = self.tag_profile(items_repository,size)[:size/2] | ||
| 161 | - desc_profile = self.desc_profile(items_repository,size)[:size/2] | ||
| 162 | - return tag_profile+desc_profile | 206 | + def _eliminate_duplicated(self,sorted_list,size): |
| 207 | + profile = sorted_list[:size] | ||
| 208 | + next_index = size | ||
| 209 | + duplicate = 1 | ||
| 210 | + while duplicate: | ||
| 211 | + duplicate = 0 | ||
| 212 | + for term in profile[:]: | ||
| 213 | + if term.startswith("Z"): | ||
| 214 | + for p in profile[:]: | ||
| 215 | + if p.startswith(term.lstrip("Z")): | ||
| 216 | + duplicate = 1 | ||
| 217 | + profile.remove(p) | ||
| 218 | + if len(sorted_list)>next_index: | ||
| 219 | + profile.append(sorted_list[next_index]) | ||
| 220 | + next_index +=1 | ||
| 221 | + return profile | ||
| 163 | 222 | ||
| 164 | def filter_pkg_profile(self,filter_list_or_file): | 223 | def filter_pkg_profile(self,filter_list_or_file): |
| 165 | """ | 224 | """ |
| @@ -172,7 +231,8 @@ class User: | @@ -172,7 +231,8 @@ class User: | ||
| 172 | with open(filter_list_or_file) as valid: | 231 | with open(filter_list_or_file) as valid: |
| 173 | valid_pkgs = [line.strip() for line in valid] | 232 | valid_pkgs = [line.strip() for line in valid] |
| 174 | except IOError: | 233 | except IOError: |
| 175 | - logging.critical("Could not open profile filter file.") | 234 | + logging.critical("Could not open profile filter file: %" % |
| 235 | + filter_list_or_file) | ||
| 176 | raise Error | 236 | raise Error |
| 177 | else: | 237 | else: |
| 178 | logging.debug("No filter provided for user profiling.") | 238 | logging.debug("No filter provided for user profiling.") |
| @@ -229,15 +289,17 @@ class RandomPopcon(User): | @@ -229,15 +289,17 @@ class RandomPopcon(User): | ||
| 229 | User.__init__(self,submission.packages,submission.user_id) | 289 | User.__init__(self,submission.packages,submission.user_id) |
| 230 | 290 | ||
| 231 | class PopconSystem(User): | 291 | class PopconSystem(User): |
| 232 | - def __init__(self,path): | 292 | + def __init__(self,path,user_id=0): |
| 233 | """ | 293 | """ |
| 234 | Set initial parameters. | 294 | Set initial parameters. |
| 235 | """ | 295 | """ |
| 236 | submission = data.PopconSubmission(path) | 296 | submission = data.PopconSubmission(path) |
| 237 | - User.__init__(self,submission.packages,submission.user_id) | 297 | + if not user_id: |
| 298 | + user_id = submission.user_id | ||
| 299 | + User.__init__(self,submission.packages,user_id) | ||
| 238 | 300 | ||
| 239 | class PkgsListSystem(User): | 301 | class PkgsListSystem(User): |
| 240 | - def __init__(self,pkgs_list_or_file): | 302 | + def __init__(self,pkgs_list_or_file,user_id=0): |
| 241 | """ | 303 | """ |
| 242 | Set initial parameters. | 304 | Set initial parameters. |
| 243 | """ | 305 | """ |
| @@ -254,7 +316,7 @@ class PkgsListSystem(User): | @@ -254,7 +316,7 @@ class PkgsListSystem(User): | ||
| 254 | logging.debug("No packages provided for user profiling.") | 316 | logging.debug("No packages provided for user profiling.") |
| 255 | return self.pkg_profile | 317 | return self.pkg_profile |
| 256 | 318 | ||
| 257 | - User.__init__(self,dict.fromkeys(pkgs_list,1)) | 319 | + User.__init__(self,dict.fromkeys(pkgs_list,1),user_id) |
| 258 | 320 | ||
| 259 | class LocalSystem(User): | 321 | class LocalSystem(User): |
| 260 | """ | 322 | """ |