Commit 1aed15a54b9f4f80ee0f25dc45088fb558661765

Authored by Tássia Camões Araújo
1 parent 5af15ae1
Exists in master and in 1 other branch add_vagrant

User profiling methods refactoring.

Showing 1 changed file with 101 additions and 39 deletions   Show diff stats
src/user.py
... ... @@ -29,16 +29,29 @@ import apt
29 29 from error import Error
30 30 from singleton import Singleton
31 31 import data
  32 +import operator
  33 +import math
32 34  
class FilterTag(xapian.ExpandDecider):
    """
    Extend xapian.ExpandDecider to consider only tag terms.

    Tag terms are the ones carrying the "XT" prefix. When a collection of
    valid tags is given, a tag term is accepted only if its name (the term
    without the "XT" prefix) belongs to that collection.
    """
    # Prefix that marks a term as a tag.
    TAG_PREFIX = "XT"

    def __init__(self, valid_tags):
        """
        Set initial parameters.

        valid_tags -- container of accepted tag names, given without the
                      "XT" prefix. Any false value (0, None, empty
                      container) disables the restriction, so that every
                      tag term is accepted.
        """
        xapian.ExpandDecider.__init__(self)
        self.valid_tags = valid_tags

    def __call__(self, term):
        """
        Return true if the term is a (valid) tag, else false.
        """
        if not term.startswith(self.TAG_PREFIX):
            return False
        if not self.valid_tags:
            return True
        # Slice the prefix off instead of using lstrip("XT"): lstrip removes
        # every leading "X" or "T" character, which would also eat the first
        # letters of a tag name that happens to start with one of them.
        return term[len(self.TAG_PREFIX):] in self.valid_tags
42 55  
43 56 class FilterDescription(xapian.ExpandDecider):
44 57 """
... ... @@ -51,6 +64,26 @@ class FilterDescription(xapian.ExpandDecider):
51 64 """
52 65 return term.islower() or term.startswith("Z")
53 66  
class FilterTag_or_Description(xapian.ExpandDecider):
    """
    Extend xapian.ExpandDecider to consider both tag terms and package
    description terms.
    """
    def __init__(self, valid_tags):
        """
        Set initial parameters.

        valid_tags -- forwarded to FilterTag to restrict the accepted tags;
                      a false value accepts every tag term.
        """
        xapian.ExpandDecider.__init__(self)
        self.valid_tags = valid_tags
        # The decider is invoked once per candidate term, so the two
        # delegate filters are built a single time here rather than on
        # every call.
        self._tag_filter = FilterTag(valid_tags)
        self._description_filter = FilterDescription()

    def __call__(self, term):
        """
        Return true if the term is accepted by FilterTag or by
        FilterDescription.
        """
        # Keep honouring a valid_tags attribute reassigned after
        # construction, as the per-call construction used to do.
        self._tag_filter.valid_tags = self.valid_tags
        return self._tag_filter(term) or self._description_filter(term)
  86 +
54 87 class DemographicProfile(Singleton):
55 88 def __init__(self):
56 89 self.admin = set(["admin", "hardware", "mail", "protocol",
... ... @@ -109,57 +142,83 @@ class User:
109 142 """
110 143 self.demographic_profile = DemographicProfile()(profiles_set)
111 144  
112   - def content_profile(self,items_repository,content,size):
def content_profile(self,items_repository,content,size,valid_tags=0):
    """
    Get user profile for a specific type of content.

    content selects both the term filter and the profiling method:

      "tag"  -- tag terms only (FilterTag)
      "desc" -- package description terms only (FilterDescription)
      "mix"  -- tags and description terms ranked together
                (FilterTag_or_Description)
      "half" -- first half of the tag profile followed by the first half
                of the description profile

    Those names use tfidf_profile; the same names with an "_eset" suffix
    ("tag_eset", "desc_eset", "mix_eset", "half_eset") use eset_profile.

    size       -- number of terms requested from the profiling method.
    valid_tags -- passed to the tag filters; a false value means that no
                  tag restriction is applied.

    Raise Error if content is not one of the names above.
    """
    kinds = ("tag", "desc", "mix", "half")
    if content in kinds:
        build, kind = self.tfidf_profile, content
    elif content in [k + "_eset" for k in kinds]:
        build, kind = self.eset_profile, content[:-len("_eset")]
    else:
        logging.debug("Unknown content type %s." % content)
        raise Error

    if kind == "tag":
        profile = build(items_repository,size,FilterTag(valid_tags))
    elif kind == "desc":
        profile = build(items_repository,size,FilterDescription())
    elif kind == "mix":
        profile = build(items_repository,size,
                        FilterTag_or_Description(valid_tags))
    else:
        # Floor division keeps the slice bound an integer even when true
        # division is in effect; a float bound would raise TypeError.
        half = size//2
        tag_profile = build(items_repository,size,FilterTag(valid_tags))
        desc_profile = build(items_repository,size,FilterDescription())
        profile = tag_profile[:half]+desc_profile[:half]
    logging.debug("User %s profile: %s" % (content, profile))
    return profile
125 175  
126   - def tag_profile(self,items_repository,size):
def tfidf_profile(self,items_repository,size,content_filter):
    """
    Return the most relevant terms for the user list of packages, ranked
    by the weights computed by data.tfidf_weighting.

    content_filter is handed to data.tfidf_weighting to select which terms
    are considered. The ranked terms then go through
    _eliminate_duplicated, which also limits the profile to size terms.
    """
    pkg_docs = data.axi_search_pkgs(items_repository,self.pkg_profile)
    weighted_terms = data.tfidf_weighting(items_repository,pkg_docs,
                                          content_filter)
    # Only the term (first field of each weighted entry) is kept.
    ranked_terms = [entry[0] for entry in weighted_terms]
    return self._eliminate_duplicated(ranked_terms,size)
139 187  
140   - def desc_profile(self,items_repository,size):
def eset_profile(self,items_repository,size,content_filter):
    """
    Return the most relevant terms for the user list of packages, taken
    from the expand set xapian builds for those packages.

    content_filter is the xapian.ExpandDecider that selects which terms
    may appear in the expand set. The resulting terms go through
    _eliminate_duplicated, which also limits the profile to size terms.
    """
    enquire = xapian.Enquire(items_repository)
    pkg_docs = data.axi_search_pkgs(items_repository,self.pkg_profile)
    # The user packages form the relevance set for the query expansion.
    relevant_pkgs = xapian.RSet()
    for pkg_doc in pkg_docs:
        relevant_pkgs.add_document(pkg_doc.docid)
    # Twice the requested size is asked for -- presumably to leave spare
    # candidates for replacing the duplicates dropped below.
    expansion = enquire.get_eset(size*2,relevant_pkgs,
                                 xapian.Enquire.INCLUDE_QUERY_TERMS,
                                 1,content_filter)
    candidate_terms = [item.term for item in expansion]
    return self._eliminate_duplicated(candidate_terms,size)
154 205  
155   - def full_profile(self,items_repository,size):
156   - """
157   - Return most relevant tags and keywords for a list of packages based
158   - their tags and descriptions.
159   - """
160   - tag_profile = self.tag_profile(items_repository,size)[:size/2]
161   - desc_profile = self.desc_profile(items_repository,size)[:size/2]
162   - return tag_profile+desc_profile
def _eliminate_duplicated(self,sorted_list,size):
    """
    Return the first size terms of sorted_list without the terms that
    duplicate a stemmed term of the profile.

    A stemmed term is one prefixed with "Z". Every profile term starting
    with the stem (the stemmed term without its "Z") is dropped, and each
    dropped term is replaced by the next unused term of sorted_list, while
    there is one. The process is repeated until a full pass drops nothing,
    since replacement terms may themselves be duplicates.

    sorted_list -- terms ordered by relevance, most relevant first.
    size        -- number of terms initially taken from sorted_list.
    """
    profile = sorted_list[:size]
    next_index = size
    changed = True
    while changed:
        changed = False
        # Iterate over snapshots, as the profile is modified on the way.
        for term in profile[:]:
            if not term.startswith("Z"):
                continue
            # Slice instead of lstrip("Z"), which would strip every
            # leading "Z" and not only the stem marker.
            stem = term[1:]
            if not stem:
                # An empty stem is a prefix of every term; matching on it
                # would discard the whole profile.
                continue
            for candidate in profile[:]:
                if candidate.startswith(stem):
                    changed = True
                    profile.remove(candidate)
                    if next_index < len(sorted_list):
                        profile.append(sorted_list[next_index])
                        next_index += 1
    return profile
163 222  
164 223 def filter_pkg_profile(self,filter_list_or_file):
165 224 """
... ... @@ -172,7 +231,8 @@ class User:
172 231 with open(filter_list_or_file) as valid:
173 232 valid_pkgs = [line.strip() for line in valid]
174 233 except IOError:
175   - logging.critical("Could not open profile filter file.")
  234 + logging.critical("Could not open profile filter file: %s" %
  235 + filter_list_or_file)
176 236 raise Error
177 237 else:
178 238 logging.debug("No filter provided for user profiling.")
... ... @@ -229,15 +289,17 @@ class RandomPopcon(User):
229 289 User.__init__(self,submission.packages,submission.user_id)
230 290  
class PopconSystem(User):
    """
    User whose package profile is read from a popcon submission.
    """
    def __init__(self,path,user_id=0):
        """
        Set initial parameters.

        path    -- passed to data.PopconSubmission to load the submission.
        user_id -- identifier for the user; when false (the default), the
                   identifier stored in the submission is used instead.
        """
        submission = data.PopconSubmission(path)
        User.__init__(self,submission.packages,
                      user_id or submission.user_id)
238 300  
239 301 class PkgsListSystem(User):
240   - def __init__(self,pkgs_list_or_file):
  302 + def __init__(self,pkgs_list_or_file,user_id=0):
241 303 """
242 304 Set initial parameters.
243 305 """
... ... @@ -254,7 +316,7 @@ class PkgsListSystem(User):
254 316 logging.debug("No packages provided for user profiling.")
255 317 return self.pkg_profile
256 318  
257   - User.__init__(self,dict.fromkeys(pkgs_list,1))
  319 + User.__init__(self,dict.fromkeys(pkgs_list,1),user_id)
258 320  
259 321 class LocalSystem(User):
260 322 """
... ...