Commit 8503780dbddd8c15b311ddf503b63794923e39cf

Authored by Tássia Camões Araújo
1 parent b2ea1ba9
Exists in master and in 1 other branch add_vagrant

data.py: implemented new class SampleAptXapianIndex to serve as data source

for tests and new search functions to retrieve packages and tags directly from apt-xapian-index;
user.py: new user profile methods are based on the kind of content: tag,
description or both;
Implemented more tests for user.py and cleaned source-code from deprecated
classes and functions which used debtags_db;
Showing 3 changed files with 143 additions and 188 deletions   Show diff stats
@@ -35,29 +35,44 @@ from singleton import Singleton @@ -35,29 +35,44 @@ from singleton import Singleton
35 import cluster 35 import cluster
36 from dissimilarity import * 36 from dissimilarity import *
37 37
38 -#class Item:  
39 -# """  
40 -# Generic item definition.  
41 -# """  
42 -#  
43 -#class Package(Item):  
44 -# """  
45 -# Definition of a GNU/Linux application as a recommender item.  
46 -# """  
47 -# def __init__(self,package_name):  
48 -# """  
49 -# Set initial attributes.  
50 -# """  
51 -# self.package_name = package_name  
52 -#  
53 -#def normalize_tags(string):  
54 -# """  
55 -# Substitute string characters : by _ and - by '.  
56 -# Examples:  
57 -# admin::package-management -> admin__package'management  
58 -# implemented-in::c++ -> implemented-in__c++  
59 -# """  
def axi_search_pkgs(axi,pkgs_list):
    """
    Search 'axi' for the documents indexed under the given package names.

    Each name in 'pkgs_list' is turned into an "XP"-prefixed term, the terms
    are OR-ed into one query and the resulting match set is returned.
    """
    pkg_terms = []
    for pkg_name in pkgs_list:
        pkg_terms.append("XP"+pkg_name)
    lookup = xapian.Enquire(axi)
    lookup.set_query(xapian.Query(xapian.Query.OP_OR, pkg_terms))
    # the result window spans the whole index, so no match is cut off
    return lookup.get_mset(0,axi.get_doccount())
  45 +
  46 +def axi_search_pkg_tags(axi,pkg):
  47 + query = xapian.Query(xapian.Query.OP_OR, "XP"+pkg)
  48 + enquire = xapian.Enquire(axi)
  49 + enquire.set_query(query)
  50 + matches = enquire.get_mset(0,1)
  51 + for m in matches:
  52 + tags = [term.term for term in axi.get_document(m.docid).termlist() if
  53 + term.term.startswith("XT")]
  54 + return tags
  55 +
  56 +class SampleAptXapianIndex(xapian.WritableDatabase):
  57 + """
  58 + Sample data source for packages information, mainly useful for tests.
  59 + """
  60 + def __init__(self,pkgs_list,axi):
  61 + xapian.WritableDatabase.__init__(self,".sample_axi",
  62 + xapian.DB_CREATE_OR_OVERWRITE)
  63 + sample = axi_search_pkgs(axi,pkgs_list)
  64 + self.all_docs = []
  65 + for package in sample:
  66 + doc_id = self.add_document(axi.get_document(package.docid))
  67 + self.all_docs.append(doc_id)
  68 +
  69 + def _print(self):
  70 + print "---"
  71 + print xapian.WritableDatabase.__repr__(self)
  72 + print "---"
  73 + for doc_id in self.all_docs:
  74 + print [term.term for term in self.get_document(doc_id).termlist()]
  75 + print "---"
61 76
62 #[FIXME] get pkg tags from axi and remove load_debtags_db method 77 #[FIXME] get pkg tags from axi and remove load_debtags_db method
63 def load_debtags_db(db_path): 78 def load_debtags_db(db_path):
@@ -75,106 +90,6 @@ def load_debtags_db(db_path): @@ -75,106 +90,6 @@ def load_debtags_db(db_path):
75 logging.error("Could not load DebtagsDB from '%s'." % self.db_path) 90 logging.error("Could not load DebtagsDB from '%s'." % self.db_path)
76 raise Error 91 raise Error
77 92
78 -#class TagsXapianIndex(xapian.WritableDatabase,Singleton):  
79 -# """  
80 -# Data source for tags info defined as a singleton xapian database.  
81 -# """  
82 -# def __init__(self,cfg):  
83 -# """  
84 -# Set initial attributes.  
85 -# """  
86 -# self.path = os.path.expanduser(cfg.tags_index)  
87 -# self.db_path = os.path.expanduser(cfg.tags_db)  
88 -# self.debtags_db = debtags.DB()  
89 -# try:  
90 -# db_file = open(self.db_path)  
91 -# except IOError:  
92 -# logging.error("Could not load DebtagsDB from '%s'." % self.db_path)  
93 -# raise Error  
94 -# md5 = hashlib.md5()  
95 -# md5.update(db_file.read())  
96 -# self.db_md5 = md5.hexdigest()  
97 -# db_file.close()  
98 -# self.load_index(cfg.reindex)  
99 -#  
100 -## def load_db(self):  
101 -## """  
102 -## Load debtags database from the source file.  
103 -## """  
104 -## tag_filter = re.compile(r"^special::.+$|^.+::TODO$")  
105 -## try:  
106 -## db_file = open(self.db_path, "r")  
107 -## self.debtags_db.read(db_file,lambda x: not tag_filter.match(x))  
108 -## db_file.close()  
109 -## except:  
110 -## logging.error("Could not load DebtagsDB from '%s'." % self.db_path)  
111 -## raise Error  
112 -#  
113 -# def relevant_tags_from_db(self,pkgs_list,qtd_of_tags):  
114 -# """  
115 -# Return most relevant tags considering a list of packages.  
116 -# """  
117 -# if not self.debtags_db.package_count():  
118 -# #print "index vazio"  
119 -# self.debtags_db = load_debtags_db(self.db_path)  
120 -# relevant_db = self.debtags_db.choose_packages(pkgs_list)  
121 -# relevance_index = debtags.relevance_index_function(self.debtags_db,  
122 -# relevant_db)  
123 -# sorted_relevant_tags = sorted(relevant_db.iter_tags(),  
124 -# lambda a, b: cmp(relevance_index(a),  
125 -# relevance_index(b)))  
126 -# return normalize_tags(' '.join(sorted_relevant_tags[-qtd_of_tags:]))  
127 -#  
128 -# def load_index(self,reindex):  
129 -# """  
130 -# Load an existing debtags index.  
131 -# """  
132 -# if not reindex:  
133 -# try:  
134 -# logging.info("Opening existing debtags xapian index at \'%s\'"  
135 -# % self.path)  
136 -# xapian.Database.__init__(self,self.path)  
137 -# md5 = self.get_metadata("md5")  
138 -# if not md5 == self.db_md5:  
139 -# logging.info("Index must be updated.")  
140 -# reindex = 1  
141 -# except xapian.DatabaseError:  
142 -# logging.info("Could not open debtags index.")  
143 -# reindex =1  
144 -#  
145 -# if reindex:  
146 -# self.new_index()  
147 -#  
148 -# def new_index(self):  
149 -# """  
150 -# Create a xapian index for debtags info based on 'debtags_db' and  
151 -# place it at 'self.path'.  
152 -# """  
153 -# if not os.path.exists(self.path):  
154 -# os.makedirs(self.path)  
155 -#  
156 -# try:  
157 -# logging.info("Indexing debtags info from \'%s\'" %  
158 -# self.db_path)  
159 -# logging.info("Creating new xapian index at \'%s\'" %  
160 -# self.path)  
161 -# xapian.WritableDatabase.__init__(self,self.path,  
162 -# xapian.DB_CREATE_OR_OVERWRITE)  
163 -# except xapian.DatabaseError:  
164 -# logging.critical("Could not create xapian index.")  
165 -# raise Error  
166 -#  
167 -# self.debtags_db = load_debtags_db(self.db_path)  
168 -# self.set_metadata("md5",self.db_md5)  
169 -#  
170 -# for pkg,tags in self.debtags_db.iter_packages_tags():  
171 -# doc = xapian.Document()  
172 -# doc.set_data(pkg)  
173 -# for tag in tags:  
174 -# doc.add_term(normalize_tags(tag))  
175 -# doc_id = self.add_document(doc)  
176 -# logging.debug("Debtags Xapian: Indexing doc %d",doc_id)  
177 -  
178 class PopconXapianIndex(xapian.WritableDatabase,Singleton): 93 class PopconXapianIndex(xapian.WritableDatabase,Singleton):
179 """ 94 """
180 Data source for popcon submissions defined as a singleton xapian database. 95 Data source for popcon submissions defined as a singleton xapian database.
src/tests/user_tests.py
@@ -36,9 +36,12 @@ class UserTests(unittest2.TestCase): @@ -36,9 +36,12 @@ class UserTests(unittest2.TestCase):
@classmethod
def setUpClass(self):
    """
    Build the fixtures shared by every test: the index opened from the
    configured axi path, a sample index restricted to ten packages and a
    user owning four of those packages.
    """
    cfg = Config()
    self.axi = xapian.Database(cfg.axi)
    user_packages = ["gimp","aaphoto","eog","emacs"]
    other_packages = ["dia","ferret","festival","file","inkscape","xpdf"]
    self.sample_axi = SampleAptXapianIndex(user_packages+other_packages,
                                           self.axi)
    user_scores = {}
    for package in user_packages:
        user_scores[package] = 1
    self.user = User(user_scores)
42 45
43 def test_hash(self): 46 def test_hash(self):
44 new_user = User(dict()) 47 new_user = User(dict())
@@ -100,34 +103,34 @@ class UserTests(unittest2.TestCase): @@ -100,34 +103,34 @@ class UserTests(unittest2.TestCase):
100 self.assertEqual(self.user.demographic_profile,desktop_art_admin) 103 self.assertEqual(self.user.demographic_profile,desktop_art_admin)
101 104
102 def test_items(self): 105 def test_items(self):
103 - self.assertEqual(self.user.items(),set(["gimp","aaphoto","eog","emacs"]))  
104 -  
105 - def test_axi_tag_profile(self):  
106 - package_terms = ["XP"+package for package in self.user.items()]  
107 - enquire = xapian.Enquire(self.pxi)  
108 - enquire.set_query(xapian.Query(xapian.Query.OP_OR,package_terms))  
109 - user_packages = enquire.get_mset(0, self.pxi.get_doccount(), None, None)  
110 - tag_terms = []  
111 - for p in user_packages:  
112 - tag_terms = tag_terms + [x.term for x in p.document.termlist() \  
113 - if x.term.startswith("XT")]  
114 - relevant_count = dict([(tag,tag_terms.count(tag)) \  
115 - for tag in set(tag_terms)])  
116 - #rank = {}  
117 - #non_relevant_count = dict()  
118 - #for tag,count in relevant_count.items():  
119 - # non_relevant_count[tag] = self.pxi.get_termfreq(tag)-count  
120 - # if non_relevant_count[tag]>0:  
121 - # rank[tag] = relevant_count[tag]/float(non_relevant_count[tag])  
122 - #print "relevant",relevant_count  
123 - #print "non_relevant",non_relevant_count  
124 - #print sorted(rank.items(), key=operator.itemgetter(1))  
125 - #[FIXME] get ths value based on real ranking  
126 - #print set(self.user.axi_tag_profile(self.pxi,4))  
127 - self.assertEqual(set(self.user.axi_tag_profile(self.pxi,4)),  
128 - set(["XTuse::editing", "XTworks-with::image",  
129 - "XTworks-with-format::png",  
130 - "XTworks-with-format::jpg"])) 106 + self.assertEqual(set(self.user.items()),
  107 + set(["gimp","aaphoto","eog","emacs"]))
  108 +
  109 + def test_profile(self):
  110 + self.assertEqual(self.user.profile(self.sample_axi,"tag",10),
  111 + self.user.tag_profile(self.sample_axi,10))
  112 + self.assertEqual(self.user.profile(self.sample_axi,"desc",10),
  113 + self.user.desc_profile(self.sample_axi,10))
  114 + self.assertEqual(self.user.profile(self.sample_axi,"full",10),
  115 + self.user.full_profile(self.sample_axi,10))
  116 +
  117 + def test_tag_profile(self):
  118 + self.assertEqual(self.user.tag_profile(self.sample_axi,10),
  119 + ['XTuse::editing', 'XTworks-with::image:raster',
  120 + 'XTworks-with-format::png', 'XTworks-with-format::jpg',
  121 + 'XTworks-with::image','XTimplemented-in::c',
  122 + 'XTsuite::gnome', 'XTsuite::emacs',
  123 + 'XTrole::metapackage', 'XTdevel::editor'])
  124 +
  125 + def test_desc_profile(self):
  126 + self.assertEqual(self.user.desc_profile(self.sample_axi,10),
  127 + ['image', 'the', 'which', 'manipulation', 'program',
  128 + 'input', 'a', 'gnu', 'images', 'this'])
  129 +
  130 + def test_full_profile(self):
  131 + self.assertEqual(self.user.full_profile(self.sample_axi,10),
  132 + (self.user.tag_profile(self.sample_axi,5)+
  133 + self.user.desc_profile(self.sample_axi,5)))
131 134
132 def test_maximal_pkg_profile(self): 135 def test_maximal_pkg_profile(self):
133 old_pkg_profile = self.user.items() 136 old_pkg_profile = self.user.items()
@@ -25,6 +25,7 @@ import xapian @@ -25,6 +25,7 @@ import xapian
25 import logging 25 import logging
26 import apt 26 import apt
27 from singleton import Singleton 27 from singleton import Singleton
  28 +import data
28 29
29 class FilterTag(xapian.ExpandDecider): 30 class FilterTag(xapian.ExpandDecider):
30 """ 31 """
@@ -34,7 +35,17 @@ class FilterTag(xapian.ExpandDecider): @@ -34,7 +35,17 @@ class FilterTag(xapian.ExpandDecider):
34 """ 35 """
35 Return true if the term is a tag, else false. 36 Return true if the term is a tag, else false.
36 """ 37 """
37 - return term[:2] == "XT" 38 + return term.startswith("XT")
  39 +
  class FilterDescription(xapian.ExpandDecider):
      """
      Extend xapian.ExpandDecider to consider only package description terms.
      """
      def __call__(self, term):
          """
          Return true if the term is entirely lowercase, else false. Terms
          carrying an uppercase prefix, such as "XT" tags, are rejected.
          """
          return (term.islower())
38 49
39 class DemographicProfile(Singleton): 50 class DemographicProfile(Singleton):
40 def __init__(self): 51 def __init__(self):
@@ -63,57 +74,83 @@ class User: @@ -63,57 +74,83 @@ class User:
63 """ 74 """
64 Define a user of a recommender. 75 Define a user of a recommender.
65 """ 76 """
def __init__(self,item_score,user_id=0,demo_profiles_set=0):
    """
    Set initial user attributes.

    item_score: mapping of items to scores; pkg_profile gets the whole
        set of items.
    user_id: user identifier; a random 128-bit number is used if none
        was provided.
    demo_profiles_set: labels of the demographic profile; defaults to
        'desktop' if none was provided.
    """
    self.item_score = item_score
    self.pkg_profile = self.items()

    if user_id:
        self.id = user_id
    else:
        random.seed()
        self.id = random.getrandbits(128)

    # use the parameter itself: the former code read an undefined
    # 'profiles_set' name whenever a demographic set was passed in
    if not demo_profiles_set:
        demo_profiles_set = set(["desktop"])
    self.set_demographic_profile(demo_profiles_set)
82 95
  96 + def items(self):
  97 + """
  98 + Return the set of user items.
  99 + """
  100 + return self.item_score.keys()
  101 +
def set_demographic_profile(self,profiles_set):
    """
    Set demographic profile based on labels in 'profiles_set'.
    """
    profile_builder = DemographicProfile()
    self.demographic_profile = profile_builder(profiles_set)
85 107
def profile(self,items_repository,content,size):
    """
    Get user profile for a specific type of content: packages tags
    ("tag"), description ("desc") or both ("full"). Any other 'content'
    value yields None.
    """
    if content == "tag":
        builder = self.tag_profile
    elif content == "desc":
        builder = self.desc_profile
    elif content == "full":
        builder = self.full_profile
    else:
        return None
    return builder(items_repository,size)
  116 +
  117 + def tag_profile(self,items_repository,size):
  118 + """
  119 + Return most relevant tags for a list of packages.
89 """ 120 """
90 - return set(self.item_score.keys())  
91 -  
92 - def axi_tag_profile(self,apt_xapian_index,profile_size):  
93 - """  
94 - Return most relevant tags for a list of packages based on axi.  
95 - """  
96 - terms = ["XP"+item for item in self.pkg_profile]  
97 - query = xapian.Query(xapian.Query.OP_OR, terms)  
98 - enquire = xapian.Enquire(apt_xapian_index)  
99 - enquire.set_query(query)  
100 - rset = xapian.RSet()  
101 - for m in enquire.get_mset(0,apt_xapian_index.get_doccount()):  
102 - rset.add_document(m.docid)  
103 - # statistically good differentiators between relevant and non-relevant  
104 - eset = enquire.get_eset(profile_size, rset, FilterTag())  
105 - profile = []  
106 - for res in eset:  
107 - profile.append(res.term)  
108 - logging.debug("%.2f %s" % (res.weight,res.term.lstrip("XT"))) 121 + enquire = xapian.Enquire(items_repository)
  122 + matches = data.axi_search_pkgs(items_repository,self.pkg_profile)
  123 + rset_packages = xapian.RSet()
  124 + for m in matches:
  125 + rset_packages.add_document(m.docid)
  126 + # statistically good differentiators
  127 + eset_tags = enquire.get_eset(size, rset_packages, FilterTag())
  128 + profile = [res.term for res in eset_tags]
109 return profile 129 return profile
110 130
111 - #def txi_tag_profile(self,tags_xapian_index,profile_size):  
112 - # """  
113 - # Return most relevant tags for a list of packages based on tags index.  
114 - # """  
115 - # return tags_xapian_index.relevant_tags_from_db(self.pkg_profile,  
def desc_profile(self,items_repository,size):
    """
    Return most relevant keywords for the packages in the user profile,
    based on their text descriptions.

    The documents matching pkg_profile in 'items_repository' form the
    relevance set; up to 'size' expansion terms accepted by
    FilterDescription are returned.
    """
    enquire = xapian.Enquire(items_repository)
    relevant_docs = xapian.RSet()
    for match in data.axi_search_pkgs(items_repository,self.pkg_profile):
        relevant_docs.add_document(match.docid)
    profile = []
    for suggestion in enquire.get_eset(size, relevant_docs,
                                       FilterDescription()):
        profile.append(suggestion.term)
    return profile
  145 +
  146 + def full_profile(self,items_repository,size):
  147 + """
  148 + Return most relevant tags and keywords for a list of packages based
  149 + their tags and descriptions.
  150 + """
  151 + tag_profile = self.tag_profile(items_repository,size)[:size/2]
  152 + desc_profile = self.desc_profile(items_repository,size)[:size/2]
  153 + return tag_profile+desc_profile
117 154
118 def maximal_pkg_profile(self): 155 def maximal_pkg_profile(self):
119 """ 156 """
@@ -137,7 +174,7 @@ class User: @@ -137,7 +174,7 @@ class User:
137 profile_size = len(self.pkg_profile) 174 profile_size = len(self.pkg_profile)
138 logging.info("Reduced packages profile size from %d to %d." % 175 logging.info("Reduced packages profile size from %d to %d." %
139 (old_profile_size, profile_size)) 176 (old_profile_size, profile_size))
140 - return set(self.pkg_profile) 177 + return self.pkg_profile
141 178
142 class LocalSystem(User): 179 class LocalSystem(User):
143 """ 180 """