Commit 8503780dbddd8c15b311ddf503b63794923e39cf
1 parent
b2ea1ba9
Exists in
master
and in
1 other branch
data.py: implemented new class SampleAptXapianIndex to serve as data source
for tests and new search functions to retrieve packages and tags directly from apt-xapian-index; user.py: new user profile methods are based on the kind of content: tag, description or both; Implemented more tests for user.py and cleaned source-code from deprecated classes and functions which used debtags_db;
Showing
3 changed files
with
143 additions
and
188 deletions
Show diff stats
src/data.py
@@ -35,29 +35,44 @@ from singleton import Singleton | @@ -35,29 +35,44 @@ from singleton import Singleton | ||
35 | import cluster | 35 | import cluster |
36 | from dissimilarity import * | 36 | from dissimilarity import * |
37 | 37 | ||
38 | -#class Item: | ||
39 | -# """ | ||
40 | -# Generic item definition. | ||
41 | -# """ | ||
42 | -# | ||
43 | -#class Package(Item): | ||
44 | -# """ | ||
45 | -# Definition of a GNU/Linux application as a recommender item. | ||
46 | -# """ | ||
47 | -# def __init__(self,package_name): | ||
48 | -# """ | ||
49 | -# Set initial attributes. | ||
50 | -# """ | ||
51 | -# self.package_name = package_name | ||
52 | -# | ||
53 | -#def normalize_tags(string): | ||
54 | -# """ | ||
55 | -# Substitute string characters : by _ and - by '. | ||
56 | -# Examples: | ||
57 | -# admin::package-management -> admin__package'management | ||
58 | -# implemented-in::c++ -> implemented-in__c++ | ||
59 | -# """ | ||
60 | -# return string.replace(':','_').replace('-','\'') | 38 | +def axi_search_pkgs(axi,pkgs_list): |
39 | + terms = ["XP"+item for item in pkgs_list] | ||
40 | + query = xapian.Query(xapian.Query.OP_OR, terms) | ||
41 | + enquire = xapian.Enquire(axi) | ||
42 | + enquire.set_query(query) | ||
43 | + matches = enquire.get_mset(0,axi.get_doccount()) | ||
44 | + return matches | ||
45 | + | ||
46 | +def axi_search_pkg_tags(axi,pkg): | ||
47 | + query = xapian.Query(xapian.Query.OP_OR, "XP"+pkg) | ||
48 | + enquire = xapian.Enquire(axi) | ||
49 | + enquire.set_query(query) | ||
50 | + matches = enquire.get_mset(0,1) | ||
51 | + for m in matches: | ||
52 | + tags = [term.term for term in axi.get_document(m.docid).termlist() if | ||
53 | + term.term.startswith("XT")] | ||
54 | + return tags | ||
55 | + | ||
56 | +class SampleAptXapianIndex(xapian.WritableDatabase): | ||
57 | + """ | ||
58 | + Sample data source for packages information, mainly useful for tests. | ||
59 | + """ | ||
60 | + def __init__(self,pkgs_list,axi): | ||
61 | + xapian.WritableDatabase.__init__(self,".sample_axi", | ||
62 | + xapian.DB_CREATE_OR_OVERWRITE) | ||
63 | + sample = axi_search_pkgs(axi,pkgs_list) | ||
64 | + self.all_docs = [] | ||
65 | + for package in sample: | ||
66 | + doc_id = self.add_document(axi.get_document(package.docid)) | ||
67 | + self.all_docs.append(doc_id) | ||
68 | + | ||
69 | + def _print(self): | ||
70 | + print "---" | ||
71 | + print xapian.WritableDatabase.__repr__(self) | ||
72 | + print "---" | ||
73 | + for doc_id in self.all_docs: | ||
74 | + print [term.term for term in self.get_document(doc_id).termlist()] | ||
75 | + print "---" | ||
61 | 76 | ||
62 | #[FIXME] get pkg tags from axi and remove load_debtags_db method | 77 | #[FIXME] get pkg tags from axi and remove load_debtags_db method |
63 | def load_debtags_db(db_path): | 78 | def load_debtags_db(db_path): |
@@ -75,106 +90,6 @@ def load_debtags_db(db_path): | @@ -75,106 +90,6 @@ def load_debtags_db(db_path): | ||
75 | logging.error("Could not load DebtagsDB from '%s'." % self.db_path) | 90 | logging.error("Could not load DebtagsDB from '%s'." % self.db_path) |
76 | raise Error | 91 | raise Error |
77 | 92 | ||
78 | -#class TagsXapianIndex(xapian.WritableDatabase,Singleton): | ||
79 | -# """ | ||
80 | -# Data source for tags info defined as a singleton xapian database. | ||
81 | -# """ | ||
82 | -# def __init__(self,cfg): | ||
83 | -# """ | ||
84 | -# Set initial attributes. | ||
85 | -# """ | ||
86 | -# self.path = os.path.expanduser(cfg.tags_index) | ||
87 | -# self.db_path = os.path.expanduser(cfg.tags_db) | ||
88 | -# self.debtags_db = debtags.DB() | ||
89 | -# try: | ||
90 | -# db_file = open(self.db_path) | ||
91 | -# except IOError: | ||
92 | -# logging.error("Could not load DebtagsDB from '%s'." % self.db_path) | ||
93 | -# raise Error | ||
94 | -# md5 = hashlib.md5() | ||
95 | -# md5.update(db_file.read()) | ||
96 | -# self.db_md5 = md5.hexdigest() | ||
97 | -# db_file.close() | ||
98 | -# self.load_index(cfg.reindex) | ||
99 | -# | ||
100 | -## def load_db(self): | ||
101 | -## """ | ||
102 | -## Load debtags database from the source file. | ||
103 | -## """ | ||
104 | -## tag_filter = re.compile(r"^special::.+$|^.+::TODO$") | ||
105 | -## try: | ||
106 | -## db_file = open(self.db_path, "r") | ||
107 | -## self.debtags_db.read(db_file,lambda x: not tag_filter.match(x)) | ||
108 | -## db_file.close() | ||
109 | -## except: | ||
110 | -## logging.error("Could not load DebtagsDB from '%s'." % self.db_path) | ||
111 | -## raise Error | ||
112 | -# | ||
113 | -# def relevant_tags_from_db(self,pkgs_list,qtd_of_tags): | ||
114 | -# """ | ||
115 | -# Return most relevant tags considering a list of packages. | ||
116 | -# """ | ||
117 | -# if not self.debtags_db.package_count(): | ||
118 | -# #print "index vazio" | ||
119 | -# self.debtags_db = load_debtags_db(self.db_path) | ||
120 | -# relevant_db = self.debtags_db.choose_packages(pkgs_list) | ||
121 | -# relevance_index = debtags.relevance_index_function(self.debtags_db, | ||
122 | -# relevant_db) | ||
123 | -# sorted_relevant_tags = sorted(relevant_db.iter_tags(), | ||
124 | -# lambda a, b: cmp(relevance_index(a), | ||
125 | -# relevance_index(b))) | ||
126 | -# return normalize_tags(' '.join(sorted_relevant_tags[-qtd_of_tags:])) | ||
127 | -# | ||
128 | -# def load_index(self,reindex): | ||
129 | -# """ | ||
130 | -# Load an existing debtags index. | ||
131 | -# """ | ||
132 | -# if not reindex: | ||
133 | -# try: | ||
134 | -# logging.info("Opening existing debtags xapian index at \'%s\'" | ||
135 | -# % self.path) | ||
136 | -# xapian.Database.__init__(self,self.path) | ||
137 | -# md5 = self.get_metadata("md5") | ||
138 | -# if not md5 == self.db_md5: | ||
139 | -# logging.info("Index must be updated.") | ||
140 | -# reindex = 1 | ||
141 | -# except xapian.DatabaseError: | ||
142 | -# logging.info("Could not open debtags index.") | ||
143 | -# reindex =1 | ||
144 | -# | ||
145 | -# if reindex: | ||
146 | -# self.new_index() | ||
147 | -# | ||
148 | -# def new_index(self): | ||
149 | -# """ | ||
150 | -# Create a xapian index for debtags info based on 'debtags_db' and | ||
151 | -# place it at 'self.path'. | ||
152 | -# """ | ||
153 | -# if not os.path.exists(self.path): | ||
154 | -# os.makedirs(self.path) | ||
155 | -# | ||
156 | -# try: | ||
157 | -# logging.info("Indexing debtags info from \'%s\'" % | ||
158 | -# self.db_path) | ||
159 | -# logging.info("Creating new xapian index at \'%s\'" % | ||
160 | -# self.path) | ||
161 | -# xapian.WritableDatabase.__init__(self,self.path, | ||
162 | -# xapian.DB_CREATE_OR_OVERWRITE) | ||
163 | -# except xapian.DatabaseError: | ||
164 | -# logging.critical("Could not create xapian index.") | ||
165 | -# raise Error | ||
166 | -# | ||
167 | -# self.debtags_db = load_debtags_db(self.db_path) | ||
168 | -# self.set_metadata("md5",self.db_md5) | ||
169 | -# | ||
170 | -# for pkg,tags in self.debtags_db.iter_packages_tags(): | ||
171 | -# doc = xapian.Document() | ||
172 | -# doc.set_data(pkg) | ||
173 | -# for tag in tags: | ||
174 | -# doc.add_term(normalize_tags(tag)) | ||
175 | -# doc_id = self.add_document(doc) | ||
176 | -# logging.debug("Debtags Xapian: Indexing doc %d",doc_id) | ||
177 | - | ||
178 | class PopconXapianIndex(xapian.WritableDatabase,Singleton): | 93 | class PopconXapianIndex(xapian.WritableDatabase,Singleton): |
179 | """ | 94 | """ |
180 | Data source for popcon submissions defined as a singleton xapian database. | 95 | Data source for popcon submissions defined as a singleton xapian database. |
src/tests/user_tests.py
@@ -36,9 +36,12 @@ class UserTests(unittest2.TestCase): | @@ -36,9 +36,12 @@ class UserTests(unittest2.TestCase): | ||
36 | @classmethod | 36 | @classmethod |
37 | def setUpClass(self): | 37 | def setUpClass(self): |
38 | cfg = Config() | 38 | cfg = Config() |
39 | - #self.axi = xapian.Database(cfg.axi) | 39 | + self.axi = xapian.Database(cfg.axi) |
40 | + sample_packages = ["gimp","aaphoto","eog","emacs","dia","ferret", | ||
41 | + "festival","file","inkscape","xpdf"] | ||
42 | + self.sample_axi = SampleAptXapianIndex(sample_packages,self.axi) | ||
40 | self.user = User({"gimp":1,"aaphoto":1,"eog":1,"emacs":1}) | 43 | self.user = User({"gimp":1,"aaphoto":1,"eog":1,"emacs":1}) |
41 | - self.pxi = PkgXapianIndex("package-xapian-index") | 44 | + #self.sample_axi._print() |
42 | 45 | ||
43 | def test_hash(self): | 46 | def test_hash(self): |
44 | new_user = User(dict()) | 47 | new_user = User(dict()) |
@@ -100,34 +103,34 @@ class UserTests(unittest2.TestCase): | @@ -100,34 +103,34 @@ class UserTests(unittest2.TestCase): | ||
100 | self.assertEqual(self.user.demographic_profile,desktop_art_admin) | 103 | self.assertEqual(self.user.demographic_profile,desktop_art_admin) |
101 | 104 | ||
102 | def test_items(self): | 105 | def test_items(self): |
103 | - self.assertEqual(self.user.items(),set(["gimp","aaphoto","eog","emacs"])) | ||
104 | - | ||
105 | - def test_axi_tag_profile(self): | ||
106 | - package_terms = ["XP"+package for package in self.user.items()] | ||
107 | - enquire = xapian.Enquire(self.pxi) | ||
108 | - enquire.set_query(xapian.Query(xapian.Query.OP_OR,package_terms)) | ||
109 | - user_packages = enquire.get_mset(0, self.pxi.get_doccount(), None, None) | ||
110 | - tag_terms = [] | ||
111 | - for p in user_packages: | ||
112 | - tag_terms = tag_terms + [x.term for x in p.document.termlist() \ | ||
113 | - if x.term.startswith("XT")] | ||
114 | - relevant_count = dict([(tag,tag_terms.count(tag)) \ | ||
115 | - for tag in set(tag_terms)]) | ||
116 | - #rank = {} | ||
117 | - #non_relevant_count = dict() | ||
118 | - #for tag,count in relevant_count.items(): | ||
119 | - # non_relevant_count[tag] = self.pxi.get_termfreq(tag)-count | ||
120 | - # if non_relevant_count[tag]>0: | ||
121 | - # rank[tag] = relevant_count[tag]/float(non_relevant_count[tag]) | ||
122 | - #print "relevant",relevant_count | ||
123 | - #print "non_relevant",non_relevant_count | ||
124 | - #print sorted(rank.items(), key=operator.itemgetter(1)) | ||
125 | - #[FIXME] get ths value based on real ranking | ||
126 | - #print set(self.user.axi_tag_profile(self.pxi,4)) | ||
127 | - self.assertEqual(set(self.user.axi_tag_profile(self.pxi,4)), | ||
128 | - set(["XTuse::editing", "XTworks-with::image", | ||
129 | - "XTworks-with-format::png", | ||
130 | - "XTworks-with-format::jpg"])) | 106 | + self.assertEqual(set(self.user.items()), |
107 | + set(["gimp","aaphoto","eog","emacs"])) | ||
108 | + | ||
109 | + def test_profile(self): | ||
110 | + self.assertEqual(self.user.profile(self.sample_axi,"tag",10), | ||
111 | + self.user.tag_profile(self.sample_axi,10)) | ||
112 | + self.assertEqual(self.user.profile(self.sample_axi,"desc",10), | ||
113 | + self.user.desc_profile(self.sample_axi,10)) | ||
114 | + self.assertEqual(self.user.profile(self.sample_axi,"full",10), | ||
115 | + self.user.full_profile(self.sample_axi,10)) | ||
116 | + | ||
117 | + def test_tag_profile(self): | ||
118 | + self.assertEqual(self.user.tag_profile(self.sample_axi,10), | ||
119 | + ['XTuse::editing', 'XTworks-with::image:raster', | ||
120 | + 'XTworks-with-format::png', 'XTworks-with-format::jpg', | ||
121 | + 'XTworks-with::image','XTimplemented-in::c', | ||
122 | + 'XTsuite::gnome', 'XTsuite::emacs', | ||
123 | + 'XTrole::metapackage', 'XTdevel::editor']) | ||
124 | + | ||
125 | + def test_desc_profile(self): | ||
126 | + self.assertEqual(self.user.desc_profile(self.sample_axi,10), | ||
127 | + ['image', 'the', 'which', 'manipulation', 'program', | ||
128 | + 'input', 'a', 'gnu', 'images', 'this']) | ||
129 | + | ||
130 | + def test_full_profile(self): | ||
131 | + self.assertEqual(self.user.full_profile(self.sample_axi,10), | ||
132 | + (self.user.tag_profile(self.sample_axi,5)+ | ||
133 | + self.user.desc_profile(self.sample_axi,5))) | ||
131 | 134 | ||
132 | def test_maximal_pkg_profile(self): | 135 | def test_maximal_pkg_profile(self): |
133 | old_pkg_profile = self.user.items() | 136 | old_pkg_profile = self.user.items() |
src/user.py
@@ -25,6 +25,7 @@ import xapian | @@ -25,6 +25,7 @@ import xapian | ||
25 | import logging | 25 | import logging |
26 | import apt | 26 | import apt |
27 | from singleton import Singleton | 27 | from singleton import Singleton |
28 | +import data | ||
28 | 29 | ||
29 | class FilterTag(xapian.ExpandDecider): | 30 | class FilterTag(xapian.ExpandDecider): |
30 | """ | 31 | """ |
@@ -34,7 +35,17 @@ class FilterTag(xapian.ExpandDecider): | @@ -34,7 +35,17 @@ class FilterTag(xapian.ExpandDecider): | ||
34 | """ | 35 | """ |
35 | Return true if the term is a tag, else false. | 36 | Return true if the term is a tag, else false. |
36 | """ | 37 | """ |
37 | - return term[:2] == "XT" | 38 | + return term.startswith("XT") |
39 | + | ||
40 | +class FilterDescription(xapian.ExpandDecider): | ||
41 | + """ | ||
42 | + Extend xapian.ExpandDecider to consider only package description terms. | ||
43 | + """ | ||
44 | + def __call__(self, term): | ||
45 | + """ | ||
46 | + Return true if the term is all lowercase (a description term), else false. | ||
47 | + """ | ||
48 | + return (term.islower()) | ||
38 | 49 | ||
39 | class DemographicProfile(Singleton): | 50 | class DemographicProfile(Singleton): |
40 | def __init__(self): | 51 | def __init__(self): |
@@ -63,57 +74,83 @@ class User: | @@ -63,57 +74,83 @@ class User: | ||
63 | """ | 74 | """ |
64 | Define a user of a recommender. | 75 | Define a user of a recommender. |
65 | """ | 76 | """ |
66 | - def __init__(self,item_score,user_id=0,profiles_set=0): | 77 | + def __init__(self,item_score,user_id=0,demo_profiles_set=0): |
67 | """ | 78 | """ |
68 | - Set initial user attributes. If no user_id was passed as parameter, a | ||
69 | - random md5-hash is generated for that purpose. If the demographic | ||
70 | - profile was not defined, it defaults to 'desktop' | 79 | + Set initial user attributes. pkg_profile gets the whole set of items, |
80 | + a random user_id is set if none was provided and the demographic | ||
81 | + profile defaults to 'desktop'. | ||
71 | """ | 82 | """ |
72 | self.item_score = item_score | 83 | self.item_score = item_score |
84 | + self.pkg_profile = self.items() | ||
85 | + | ||
73 | if user_id: | 86 | if user_id: |
74 | self.id = user_id | 87 | self.id = user_id |
75 | else: | 88 | else: |
76 | random.seed() | 89 | random.seed() |
77 | self.id = random.getrandbits(128) | 90 | self.id = random.getrandbits(128) |
78 | - self.pkg_profile = self.item_score.keys() | ||
79 | - if not profiles_set: | 91 | + |
92 | + if not demo_profiles_set: | ||
80 | profiles_set = set(["desktop"]) | 93 | profiles_set = set(["desktop"]) |
81 | self.set_demographic_profile(profiles_set) | 94 | self.set_demographic_profile(profiles_set) |
82 | 95 | ||
96 | + def items(self): | ||
97 | + """ | ||
98 | + Return the set of user items. | ||
99 | + """ | ||
100 | + return self.item_score.keys() | ||
101 | + | ||
83 | def set_demographic_profile(self,profiles_set): | 102 | def set_demographic_profile(self,profiles_set): |
103 | + """ | ||
104 | + Set demographic profile based on labels in 'profiles_set'. | ||
105 | + """ | ||
84 | self.demographic_profile = DemographicProfile()(profiles_set) | 106 | self.demographic_profile = DemographicProfile()(profiles_set) |
85 | 107 | ||
86 | - def items(self): | 108 | + def profile(self,items_repository,content,size): |
87 | """ | 109 | """ |
88 | - Return the set of user items. | 110 | + Get user profile for a specific type of content: packages tags, |
111 | + description or both (full_profile) | ||
112 | + """ | ||
113 | + if content == "tag": return self.tag_profile(items_repository,size) | ||
114 | + if content == "desc": return self.desc_profile(items_repository,size) | ||
115 | + if content == "full": return self.full_profile(items_repository,size) | ||
116 | + | ||
117 | + def tag_profile(self,items_repository,size): | ||
118 | + """ | ||
119 | + Return most relevant tags for a list of packages. | ||
89 | """ | 120 | """ |
90 | - return set(self.item_score.keys()) | ||
91 | - | ||
92 | - def axi_tag_profile(self,apt_xapian_index,profile_size): | ||
93 | - """ | ||
94 | - Return most relevant tags for a list of packages based on axi. | ||
95 | - """ | ||
96 | - terms = ["XP"+item for item in self.pkg_profile] | ||
97 | - query = xapian.Query(xapian.Query.OP_OR, terms) | ||
98 | - enquire = xapian.Enquire(apt_xapian_index) | ||
99 | - enquire.set_query(query) | ||
100 | - rset = xapian.RSet() | ||
101 | - for m in enquire.get_mset(0,apt_xapian_index.get_doccount()): | ||
102 | - rset.add_document(m.docid) | ||
103 | - # statistically good differentiators between relevant and non-relevant | ||
104 | - eset = enquire.get_eset(profile_size, rset, FilterTag()) | ||
105 | - profile = [] | ||
106 | - for res in eset: | ||
107 | - profile.append(res.term) | ||
108 | - logging.debug("%.2f %s" % (res.weight,res.term.lstrip("XT"))) | 121 | + enquire = xapian.Enquire(items_repository) |
122 | + matches = data.axi_search_pkgs(items_repository,self.pkg_profile) | ||
123 | + rset_packages = xapian.RSet() | ||
124 | + for m in matches: | ||
125 | + rset_packages.add_document(m.docid) | ||
126 | + # statistically good differentiators | ||
127 | + eset_tags = enquire.get_eset(size, rset_packages, FilterTag()) | ||
128 | + profile = [res.term for res in eset_tags] | ||
109 | return profile | 129 | return profile |
110 | 130 | ||
111 | - #def txi_tag_profile(self,tags_xapian_index,profile_size): | ||
112 | - # """ | ||
113 | - # Return most relevant tags for a list of packages based on tags index. | ||
114 | - # """ | ||
115 | - # return tags_xapian_index.relevant_tags_from_db(self.pkg_profile, | ||
116 | - # profile_size) | 131 | + def desc_profile(self,items_repository,size): |
132 | + """ | ||
133 | + Return most relevant keywords for a list of packages based on their | ||
134 | + text descriptions. | ||
135 | + """ | ||
136 | + enquire = xapian.Enquire(items_repository) | ||
137 | + matches = data.axi_search_pkgs(items_repository,self.pkg_profile) | ||
138 | + rset_packages = xapian.RSet() | ||
139 | + for m in matches: | ||
140 | + rset_packages.add_document(m.docid) | ||
141 | + eset_keywords = enquire.get_eset(size, rset_packages, | ||
142 | + FilterDescription()) | ||
143 | + profile = [res.term for res in eset_keywords] | ||
144 | + return profile | ||
145 | + | ||
146 | + def full_profile(self,items_repository,size): | ||
147 | + """ | ||
148 | + Return most relevant tags and keywords for a list of packages based | ||
149 | + on their tags and descriptions. | ||
150 | + """ | ||
151 | + tag_profile = self.tag_profile(items_repository,size)[:size/2] | ||
152 | + desc_profile = self.desc_profile(items_repository,size)[:size/2] | ||
153 | + return tag_profile+desc_profile | ||
117 | 154 | ||
118 | def maximal_pkg_profile(self): | 155 | def maximal_pkg_profile(self): |
119 | """ | 156 | """ |
@@ -137,7 +174,7 @@ class User: | @@ -137,7 +174,7 @@ class User: | ||
137 | profile_size = len(self.pkg_profile) | 174 | profile_size = len(self.pkg_profile) |
138 | logging.info("Reduced packages profile size from %d to %d." % | 175 | logging.info("Reduced packages profile size from %d to %d." % |
139 | (old_profile_size, profile_size)) | 176 | (old_profile_size, profile_size)) |
140 | - return set(self.pkg_profile) | 177 | + return self.pkg_profile |
141 | 178 | ||
142 | class LocalSystem(User): | 179 | class LocalSystem(User): |
143 | """ | 180 | """ |