Commit 1aed15a54b9f4f80ee0f25dc45088fb558661765

Authored by Tássia Camões Araújo
1 parent 5af15ae1
Exists in master and in 1 other branch add_vagrant

User profiling methods refactoring.

Showing 1 changed file with 101 additions and 39 deletions   Show diff stats
@@ -29,16 +29,29 @@ import apt @@ -29,16 +29,29 @@ import apt
29 from error import Error 29 from error import Error
30 from singleton import Singleton 30 from singleton import Singleton
31 import data 31 import data
  32 +import operator
  33 +import math
32 34
33 class FilterTag(xapian.ExpandDecider): 35 class FilterTag(xapian.ExpandDecider):
34 """ 36 """
35 Extend xapian.ExpandDecider to consider only tag terms. 37 Extend xapian.ExpandDecider to consider only tag terms.
36 """ 38 """
  39 + def __init__(self, valid_tags):
  40 + """
  41 + Set initial parameters.
  42 + """
  43 + xapian.ExpandDecider.__init__(self)
  44 + self.valid_tags = valid_tags
  45 +
37 def __call__(self, term): 46 def __call__(self, term):
38 """ 47 """
39 Return true if the term is a tag, else false. 48 Return true if the term is a tag, else false.
40 """ 49 """
41 - return term.startswith("XT") 50 + if self.valid_tags:
  51 + is_valid = term.lstrip("XT") in self.valid_tags
  52 + else:
  53 + is_valid = 1
  54 + return term.startswith("XT") and is_valid
42 55
43 class FilterDescription(xapian.ExpandDecider): 56 class FilterDescription(xapian.ExpandDecider):
44 """ 57 """
@@ -51,6 +64,26 @@ class FilterDescription(xapian.ExpandDecider): @@ -51,6 +64,26 @@ class FilterDescription(xapian.ExpandDecider):
51 """ 64 """
52 return term.islower() or term.startswith("Z") 65 return term.islower() or term.startswith("Z")
53 66
  67 +class FilterTag_or_Description(xapian.ExpandDecider):
  68 + """
  69 + Extend xapian.ExpandDecider to consider both tag terms and package description terms.
  70 + """
  71 + def __init__(self, valid_tags):
  72 + """
  73 + Set initial parameters.
  74 + """
  75 + xapian.ExpandDecider.__init__(self)
  76 + self.valid_tags = valid_tags
  77 +
  78 + def __call__(self, term):
  79 + """
  80 + Return true if the term is a valid tag, or if the term or its stemmed
  81 + version is part of a package description.
  82 + """
  83 + is_tag = FilterTag(self.valid_tags)(term)
  84 + is_description = FilterDescription()(term)
  85 + return is_tag or is_description
  86 +
54 class DemographicProfile(Singleton): 87 class DemographicProfile(Singleton):
55 def __init__(self): 88 def __init__(self):
56 self.admin = set(["admin", "hardware", "mail", "protocol", 89 self.admin = set(["admin", "hardware", "mail", "protocol",
@@ -109,57 +142,83 @@ class User: @@ -109,57 +142,83 @@ class User:
109 """ 142 """
110 self.demographic_profile = DemographicProfile()(profiles_set) 143 self.demographic_profile = DemographicProfile()(profiles_set)
111 144
112 - def content_profile(self,items_repository,content,size): 145 + def content_profile(self,items_repository,content,size,valid_tags=0):
113 """ 146 """
114 Get user profile for a specific type of content: packages tags, 147 Get user profile for a specific type of content: packages tags,
115 - description or both (full_profile) 148 + description or both (mixed and half-half profiles)
116 """ 149 """
117 if content == "tag": 150 if content == "tag":
118 - profile = self.tag_profile(items_repository,size)  
119 - if content == "desc":  
120 - profile = self.desc_profile(items_repository,size)  
121 - if content == "full":  
122 - profile = self.full_profile(items_repository,size)  
123 - logging.debug("User profile: %s" % profile) 151 + profile = self.tfidf_profile(items_repository,size,FilterTag(valid_tags))
  152 + elif content == "desc":
  153 + profile = self.tfidf_profile(items_repository,size,FilterDescription())
  154 + elif content == "mix":
  155 + profile = self.tfidf_profile(items_repository,size,FilterTag_or_Description(valid_tags))
  156 + elif content == "half":
  157 + tag_profile = self.tfidf_profile(items_repository,size,FilterTag(valid_tags))
  158 + desc_profile = self.tfidf_profile(items_repository,size,FilterDescription())
  159 + profile = tag_profile[:size/2]+desc_profile[:size/2]
  160 + elif content == "tag_eset":
  161 + profile = self.eset_profile(items_repository,size,FilterTag(valid_tags))
  162 + elif content == "desc_eset":
  163 + profile = self.eset_profile(items_repository,size,FilterDescription())
  164 + elif content == "mix_eset":
  165 + profile = self.eset_profile(items_repository,size,FilterTag_or_Description(valid_tags))
  166 + elif content == "half_eset":
  167 + tag_profile = self.eset_profile(items_repository,size,FilterTag(valid_tags))
  168 + desc_profile = self.eset_profile(items_repository,size,FilterDescription())
  169 + profile = tag_profile[:size/2]+desc_profile[:size/2]
  170 + else:
  171 + logging.debug("Unknown content type %s." % content)
  172 + raise Error
  173 + logging.debug("User %s profile: %s" % (content, profile))
124 return profile 174 return profile
125 175
126 - def tag_profile(self,items_repository,size): 176 + def tfidf_profile(self,items_repository,size,content_filter):
127 """ 177 """
128 - Return most relevant tags for a list of packages. 178 + Return the most relevant terms for the user's list of packages based on
  179 + the sublinear tfidf weight of the packages' terms, as selected by content_filter.
129 """ 180 """
130 - enquire = xapian.Enquire(items_repository)  
131 docs = data.axi_search_pkgs(items_repository,self.pkg_profile) 181 docs = data.axi_search_pkgs(items_repository,self.pkg_profile)
132 - rset_packages = xapian.RSet()  
133 - for docid in docs:  
134 - rset_packages.add_document(docid)  
135 - # statistically good differentiators  
136 - eset_tags = enquire.get_eset(size, rset_packages, FilterTag())  
137 - profile = [res.term for res in eset_tags] 182 + #weights = data.tfidf_plus(items_repository,docs,content_filter)
  183 + weights = data.tfidf_weighting(items_repository,docs,content_filter)
  184 + # Eliminate terms duplicated by a stemmed (Z-prefixed) term
  185 + profile = self._eliminate_duplicated([w[0] for w in weights],size)
138 return profile 186 return profile
139 187
140 - def desc_profile(self,items_repository,size): 188 + def eset_profile(self,items_repository,size,content_filter):
141 """ 189 """
142 - Return most relevant keywords for a list of packages based on their  
143 - text descriptions. 190 + Return the most relevant terms, as filtered by content_filter, for the user's packages.
144 """ 191 """
  192 + # Store package documents in a relevance set (RSet)
145 enquire = xapian.Enquire(items_repository) 193 enquire = xapian.Enquire(items_repository)
146 docs = data.axi_search_pkgs(items_repository,self.pkg_profile) 194 docs = data.axi_search_pkgs(items_repository,self.pkg_profile)
147 rset_packages = xapian.RSet() 195 rset_packages = xapian.RSet()
148 - for docid in docs:  
149 - rset_packages.add_document(docid)  
150 - eset_keywords = enquire.get_eset(size, rset_packages,  
151 - FilterDescription())  
152 - profile = [res.term for res in eset_keywords] 196 + for d in docs:
  197 + rset_packages.add_document(d.docid)
  198 + # Get expanded query terms (statistically good differentiators)
  199 + eset_tags = enquire.get_eset(size*2,rset_packages,
  200 + xapian.Enquire.INCLUDE_QUERY_TERMS,
  201 + 1,content_filter)
  202 + # Eliminate terms duplicated by a stemmed (Z-prefixed) term
  203 + profile = self._eliminate_duplicated([res.term for res in eset_tags],size)
153 return profile 204 return profile
154 205
155 - def full_profile(self,items_repository,size):  
156 - """  
157 - Return most relevant tags and keywords for a list of packages based  
158 - their tags and descriptions.  
159 - """  
160 - tag_profile = self.tag_profile(items_repository,size)[:size/2]  
161 - desc_profile = self.desc_profile(items_repository,size)[:size/2]  
162 - return tag_profile+desc_profile 206 + def _eliminate_duplicated(self,sorted_list,size):
  207 + profile = sorted_list[:size]
  208 + next_index = size
  209 + duplicate = 1
  210 + while duplicate:
  211 + duplicate = 0
  212 + for term in profile[:]:
  213 + if term.startswith("Z"):
  214 + for p in profile[:]:
  215 + if p.startswith(term.lstrip("Z")):
  216 + duplicate = 1
  217 + profile.remove(p)
  218 + if len(sorted_list)>next_index:
  219 + profile.append(sorted_list[next_index])
  220 + next_index +=1
  221 + return profile
163 222
164 def filter_pkg_profile(self,filter_list_or_file): 223 def filter_pkg_profile(self,filter_list_or_file):
165 """ 224 """
@@ -172,7 +231,8 @@ class User: @@ -172,7 +231,8 @@ class User:
172 with open(filter_list_or_file) as valid: 231 with open(filter_list_or_file) as valid:
173 valid_pkgs = [line.strip() for line in valid] 232 valid_pkgs = [line.strip() for line in valid]
174 except IOError: 233 except IOError:
175 - logging.critical("Could not open profile filter file.") 234 + logging.critical("Could not open profile filter file: %" %
  235 + filter_list_or_file)
176 raise Error 236 raise Error
177 else: 237 else:
178 logging.debug("No filter provided for user profiling.") 238 logging.debug("No filter provided for user profiling.")
@@ -229,15 +289,17 @@ class RandomPopcon(User): @@ -229,15 +289,17 @@ class RandomPopcon(User):
229 User.__init__(self,submission.packages,submission.user_id) 289 User.__init__(self,submission.packages,submission.user_id)
230 290
231 class PopconSystem(User): 291 class PopconSystem(User):
232 - def __init__(self,path): 292 + def __init__(self,path,user_id=0):
233 """ 293 """
234 Set initial parameters. 294 Set initial parameters.
235 """ 295 """
236 submission = data.PopconSubmission(path) 296 submission = data.PopconSubmission(path)
237 - User.__init__(self,submission.packages,submission.user_id) 297 + if not user_id:
  298 + user_id = submission.user_id
  299 + User.__init__(self,submission.packages,user_id)
238 300
239 class PkgsListSystem(User): 301 class PkgsListSystem(User):
240 - def __init__(self,pkgs_list_or_file): 302 + def __init__(self,pkgs_list_or_file,user_id=0):
241 """ 303 """
242 Set initial parameters. 304 Set initial parameters.
243 """ 305 """
@@ -254,7 +316,7 @@ class PkgsListSystem(User): @@ -254,7 +316,7 @@ class PkgsListSystem(User):
254 logging.debug("No packages provided for user profiling.") 316 logging.debug("No packages provided for user profiling.")
255 return self.pkg_profile 317 return self.pkg_profile
256 318
257 - User.__init__(self,dict.fromkeys(pkgs_list,1)) 319 + User.__init__(self,dict.fromkeys(pkgs_list,1),user_id)
258 320
259 class LocalSystem(User): 321 class LocalSystem(User):
260 """ 322 """