Commit 8503780dbddd8c15b311ddf503b63794923e39cf
1 parent
b2ea1ba9
Exists in
master
and in
1 other branch
data.py: implemented new class SampleAptXapianIndex to serve as a data source
for tests, and new search functions to retrieve packages and tags directly from apt-xapian-index; user.py: new user profile methods are based on the kind of content: tag, description or both; implemented more tests for user.py and removed deprecated classes and functions which used debtags_db from the source code;
Showing
3 changed files
with
143 additions
and
188 deletions
Show diff stats
src/data.py
... | ... | @@ -35,29 +35,44 @@ from singleton import Singleton |
35 | 35 | import cluster |
36 | 36 | from dissimilarity import * |
37 | 37 | |
38 | -#class Item: | |
39 | -# """ | |
40 | -# Generic item definition. | |
41 | -# """ | |
42 | -# | |
43 | -#class Package(Item): | |
44 | -# """ | |
45 | -# Definition of a GNU/Linux application as a recommender item. | |
46 | -# """ | |
47 | -# def __init__(self,package_name): | |
48 | -# """ | |
49 | -# Set initial attributes. | |
50 | -# """ | |
51 | -# self.package_name = package_name | |
52 | -# | |
53 | -#def normalize_tags(string): | |
54 | -# """ | |
55 | -# Substitute string characters : by _ and - by '. | |
56 | -# Examples: | |
57 | -# admin::package-management -> admin__package'management | |
58 | -# implemented-in::c++ -> implemented-in__c++ | |
59 | -# """ | |
60 | -# return string.replace(':','_').replace('-','\'') | |
38 | +def axi_search_pkgs(axi,pkgs_list): | |
39 | + terms = ["XP"+item for item in pkgs_list] | |
40 | + query = xapian.Query(xapian.Query.OP_OR, terms) | |
41 | + enquire = xapian.Enquire(axi) | |
42 | + enquire.set_query(query) | |
43 | + matches = enquire.get_mset(0,axi.get_doccount()) | |
44 | + return matches | |
45 | + | |
46 | +def axi_search_pkg_tags(axi,pkg): | |
47 | + query = xapian.Query(xapian.Query.OP_OR, "XP"+pkg) | |
48 | + enquire = xapian.Enquire(axi) | |
49 | + enquire.set_query(query) | |
50 | + matches = enquire.get_mset(0,1) | |
51 | + for m in matches: | |
52 | + tags = [term.term for term in axi.get_document(m.docid).termlist() if | |
53 | + term.term.startswith("XT")] | |
54 | + return tags | |
55 | + | |
56 | +class SampleAptXapianIndex(xapian.WritableDatabase): | |
57 | + """ | |
58 | + Sample data source for packages information, mainly useful for tests. | |
59 | + """ | |
60 | + def __init__(self,pkgs_list,axi): | |
61 | + xapian.WritableDatabase.__init__(self,".sample_axi", | |
62 | + xapian.DB_CREATE_OR_OVERWRITE) | |
63 | + sample = axi_search_pkgs(axi,pkgs_list) | |
64 | + self.all_docs = [] | |
65 | + for package in sample: | |
66 | + doc_id = self.add_document(axi.get_document(package.docid)) | |
67 | + self.all_docs.append(doc_id) | |
68 | + | |
69 | + def _print(self): | |
70 | + print "---" | |
71 | + print xapian.WritableDatabase.__repr__(self) | |
72 | + print "---" | |
73 | + for doc_id in self.all_docs: | |
74 | + print [term.term for term in self.get_document(doc_id).termlist()] | |
75 | + print "---" | |
61 | 76 | |
62 | 77 | #[FIXME] get pkg tags from axi and remove load_debtags_db method |
63 | 78 | def load_debtags_db(db_path): |
... | ... | @@ -75,106 +90,6 @@ def load_debtags_db(db_path): |
75 | 90 | logging.error("Could not load DebtagsDB from '%s'." % self.db_path) |
76 | 91 | raise Error |
77 | 92 | |
78 | -#class TagsXapianIndex(xapian.WritableDatabase,Singleton): | |
79 | -# """ | |
80 | -# Data source for tags info defined as a singleton xapian database. | |
81 | -# """ | |
82 | -# def __init__(self,cfg): | |
83 | -# """ | |
84 | -# Set initial attributes. | |
85 | -# """ | |
86 | -# self.path = os.path.expanduser(cfg.tags_index) | |
87 | -# self.db_path = os.path.expanduser(cfg.tags_db) | |
88 | -# self.debtags_db = debtags.DB() | |
89 | -# try: | |
90 | -# db_file = open(self.db_path) | |
91 | -# except IOError: | |
92 | -# logging.error("Could not load DebtagsDB from '%s'." % self.db_path) | |
93 | -# raise Error | |
94 | -# md5 = hashlib.md5() | |
95 | -# md5.update(db_file.read()) | |
96 | -# self.db_md5 = md5.hexdigest() | |
97 | -# db_file.close() | |
98 | -# self.load_index(cfg.reindex) | |
99 | -# | |
100 | -## def load_db(self): | |
101 | -## """ | |
102 | -## Load debtags database from the source file. | |
103 | -## """ | |
104 | -## tag_filter = re.compile(r"^special::.+$|^.+::TODO$") | |
105 | -## try: | |
106 | -## db_file = open(self.db_path, "r") | |
107 | -## self.debtags_db.read(db_file,lambda x: not tag_filter.match(x)) | |
108 | -## db_file.close() | |
109 | -## except: | |
110 | -## logging.error("Could not load DebtagsDB from '%s'." % self.db_path) | |
111 | -## raise Error | |
112 | -# | |
113 | -# def relevant_tags_from_db(self,pkgs_list,qtd_of_tags): | |
114 | -# """ | |
115 | -# Return most relevant tags considering a list of packages. | |
116 | -# """ | |
117 | -# if not self.debtags_db.package_count(): | |
118 | -# #print "index vazio" | |
119 | -# self.debtags_db = load_debtags_db(self.db_path) | |
120 | -# relevant_db = self.debtags_db.choose_packages(pkgs_list) | |
121 | -# relevance_index = debtags.relevance_index_function(self.debtags_db, | |
122 | -# relevant_db) | |
123 | -# sorted_relevant_tags = sorted(relevant_db.iter_tags(), | |
124 | -# lambda a, b: cmp(relevance_index(a), | |
125 | -# relevance_index(b))) | |
126 | -# return normalize_tags(' '.join(sorted_relevant_tags[-qtd_of_tags:])) | |
127 | -# | |
128 | -# def load_index(self,reindex): | |
129 | -# """ | |
130 | -# Load an existing debtags index. | |
131 | -# """ | |
132 | -# if not reindex: | |
133 | -# try: | |
134 | -# logging.info("Opening existing debtags xapian index at \'%s\'" | |
135 | -# % self.path) | |
136 | -# xapian.Database.__init__(self,self.path) | |
137 | -# md5 = self.get_metadata("md5") | |
138 | -# if not md5 == self.db_md5: | |
139 | -# logging.info("Index must be updated.") | |
140 | -# reindex = 1 | |
141 | -# except xapian.DatabaseError: | |
142 | -# logging.info("Could not open debtags index.") | |
143 | -# reindex =1 | |
144 | -# | |
145 | -# if reindex: | |
146 | -# self.new_index() | |
147 | -# | |
148 | -# def new_index(self): | |
149 | -# """ | |
150 | -# Create a xapian index for debtags info based on 'debtags_db' and | |
151 | -# place it at 'self.path'. | |
152 | -# """ | |
153 | -# if not os.path.exists(self.path): | |
154 | -# os.makedirs(self.path) | |
155 | -# | |
156 | -# try: | |
157 | -# logging.info("Indexing debtags info from \'%s\'" % | |
158 | -# self.db_path) | |
159 | -# logging.info("Creating new xapian index at \'%s\'" % | |
160 | -# self.path) | |
161 | -# xapian.WritableDatabase.__init__(self,self.path, | |
162 | -# xapian.DB_CREATE_OR_OVERWRITE) | |
163 | -# except xapian.DatabaseError: | |
164 | -# logging.critical("Could not create xapian index.") | |
165 | -# raise Error | |
166 | -# | |
167 | -# self.debtags_db = load_debtags_db(self.db_path) | |
168 | -# self.set_metadata("md5",self.db_md5) | |
169 | -# | |
170 | -# for pkg,tags in self.debtags_db.iter_packages_tags(): | |
171 | -# doc = xapian.Document() | |
172 | -# doc.set_data(pkg) | |
173 | -# for tag in tags: | |
174 | -# doc.add_term(normalize_tags(tag)) | |
175 | -# doc_id = self.add_document(doc) | |
176 | -# logging.debug("Debtags Xapian: Indexing doc %d",doc_id) | |
177 | - | |
178 | 93 | class PopconXapianIndex(xapian.WritableDatabase,Singleton): |
179 | 94 | """ |
180 | 95 | Data source for popcon submissions defined as a singleton xapian database. | ... | ... |
src/tests/user_tests.py
... | ... | @@ -36,9 +36,12 @@ class UserTests(unittest2.TestCase): |
36 | 36 | @classmethod |
37 | 37 | def setUpClass(self): |
38 | 38 | cfg = Config() |
39 | - #self.axi = xapian.Database(cfg.axi) | |
39 | + self.axi = xapian.Database(cfg.axi) | |
40 | + sample_packages = ["gimp","aaphoto","eog","emacs","dia","ferret", | |
41 | + "festival","file","inkscape","xpdf"] | |
42 | + self.sample_axi = SampleAptXapianIndex(sample_packages,self.axi) | |
40 | 43 | self.user = User({"gimp":1,"aaphoto":1,"eog":1,"emacs":1}) |
41 | - self.pxi = PkgXapianIndex("package-xapian-index") | |
44 | + #self.sample_axi._print() | |
42 | 45 | |
43 | 46 | def test_hash(self): |
44 | 47 | new_user = User(dict()) |
... | ... | @@ -100,34 +103,34 @@ class UserTests(unittest2.TestCase): |
100 | 103 | self.assertEqual(self.user.demographic_profile,desktop_art_admin) |
101 | 104 | |
102 | 105 | def test_items(self): |
103 | - self.assertEqual(self.user.items(),set(["gimp","aaphoto","eog","emacs"])) | |
104 | - | |
105 | - def test_axi_tag_profile(self): | |
106 | - package_terms = ["XP"+package for package in self.user.items()] | |
107 | - enquire = xapian.Enquire(self.pxi) | |
108 | - enquire.set_query(xapian.Query(xapian.Query.OP_OR,package_terms)) | |
109 | - user_packages = enquire.get_mset(0, self.pxi.get_doccount(), None, None) | |
110 | - tag_terms = [] | |
111 | - for p in user_packages: | |
112 | - tag_terms = tag_terms + [x.term for x in p.document.termlist() \ | |
113 | - if x.term.startswith("XT")] | |
114 | - relevant_count = dict([(tag,tag_terms.count(tag)) \ | |
115 | - for tag in set(tag_terms)]) | |
116 | - #rank = {} | |
117 | - #non_relevant_count = dict() | |
118 | - #for tag,count in relevant_count.items(): | |
119 | - # non_relevant_count[tag] = self.pxi.get_termfreq(tag)-count | |
120 | - # if non_relevant_count[tag]>0: | |
121 | - # rank[tag] = relevant_count[tag]/float(non_relevant_count[tag]) | |
122 | - #print "relevant",relevant_count | |
123 | - #print "non_relevant",non_relevant_count | |
124 | - #print sorted(rank.items(), key=operator.itemgetter(1)) | |
125 | - #[FIXME] get ths value based on real ranking | |
126 | - #print set(self.user.axi_tag_profile(self.pxi,4)) | |
127 | - self.assertEqual(set(self.user.axi_tag_profile(self.pxi,4)), | |
128 | - set(["XTuse::editing", "XTworks-with::image", | |
129 | - "XTworks-with-format::png", | |
130 | - "XTworks-with-format::jpg"])) | |
106 | + self.assertEqual(set(self.user.items()), | |
107 | + set(["gimp","aaphoto","eog","emacs"])) | |
108 | + | |
109 | + def test_profile(self): | |
110 | + self.assertEqual(self.user.profile(self.sample_axi,"tag",10), | |
111 | + self.user.tag_profile(self.sample_axi,10)) | |
112 | + self.assertEqual(self.user.profile(self.sample_axi,"desc",10), | |
113 | + self.user.desc_profile(self.sample_axi,10)) | |
114 | + self.assertEqual(self.user.profile(self.sample_axi,"full",10), | |
115 | + self.user.full_profile(self.sample_axi,10)) | |
116 | + | |
117 | + def test_tag_profile(self): | |
118 | + self.assertEqual(self.user.tag_profile(self.sample_axi,10), | |
119 | + ['XTuse::editing', 'XTworks-with::image:raster', | |
120 | + 'XTworks-with-format::png', 'XTworks-with-format::jpg', | |
121 | + 'XTworks-with::image','XTimplemented-in::c', | |
122 | + 'XTsuite::gnome', 'XTsuite::emacs', | |
123 | + 'XTrole::metapackage', 'XTdevel::editor']) | |
124 | + | |
125 | + def test_desc_profile(self): | |
126 | + self.assertEqual(self.user.desc_profile(self.sample_axi,10), | |
127 | + ['image', 'the', 'which', 'manipulation', 'program', | |
128 | + 'input', 'a', 'gnu', 'images', 'this']) | |
129 | + | |
130 | + def test_full_profile(self): | |
131 | + self.assertEqual(self.user.full_profile(self.sample_axi,10), | |
132 | + (self.user.tag_profile(self.sample_axi,5)+ | |
133 | + self.user.desc_profile(self.sample_axi,5))) | |
131 | 134 | |
132 | 135 | def test_maximal_pkg_profile(self): |
133 | 136 | old_pkg_profile = self.user.items() | ... | ... |
src/user.py
... | ... | @@ -25,6 +25,7 @@ import xapian |
25 | 25 | import logging |
26 | 26 | import apt |
27 | 27 | from singleton import Singleton |
28 | +import data | |
28 | 29 | |
29 | 30 | class FilterTag(xapian.ExpandDecider): |
30 | 31 | """ |
... | ... | @@ -34,7 +35,17 @@ class FilterTag(xapian.ExpandDecider): |
34 | 35 | """ |
35 | 36 | Return true if the term is a tag, else false. |
36 | 37 | """ |
37 | - return term[:2] == "XT" | |
38 | + return term.startswith("XT") | |
39 | + | |
40 | +class FilterDescription(xapian.ExpandDecider): | |
41 | + """ | |
42 | + Extend xapian.ExpandDecider to consider only package description terms. | |
43 | + """ | |
44 | + def __call__(self, term): | |
45 | + """ | |
46 | + Return true if the term is a tag, else false. | |
47 | + """ | |
48 | + return (term.islower()) | |
38 | 49 | |
39 | 50 | class DemographicProfile(Singleton): |
40 | 51 | def __init__(self): |
... | ... | @@ -63,57 +74,83 @@ class User: |
63 | 74 | """ |
64 | 75 | Define a user of a recommender. |
65 | 76 | """ |
66 | - def __init__(self,item_score,user_id=0,profiles_set=0): | |
77 | + def __init__(self,item_score,user_id=0,demo_profiles_set=0): | |
67 | 78 | """ |
68 | - Set initial user attributes. If no user_id was passed as parameter, a | |
69 | - random md5-hash is generated for that purpose. If the demographic | |
70 | - profile was not defined, it defaults to 'desktop' | |
79 | + Set initial user attributes. pkg_profile gets the whole set of items, | |
80 | + a random user_id is set if none was provided and the demographic | |
81 | + profile defaults to 'desktop'. | |
71 | 82 | """ |
72 | 83 | self.item_score = item_score |
84 | + self.pkg_profile = self.items() | |
85 | + | |
73 | 86 | if user_id: |
74 | 87 | self.id = user_id |
75 | 88 | else: |
76 | 89 | random.seed() |
77 | 90 | self.id = random.getrandbits(128) |
78 | - self.pkg_profile = self.item_score.keys() | |
79 | - if not profiles_set: | |
91 | + | |
92 | + if not demo_profiles_set: | |
80 | 93 | profiles_set = set(["desktop"]) |
81 | 94 | self.set_demographic_profile(profiles_set) |
82 | 95 | |
96 | + def items(self): | |
97 | + """ | |
98 | + Return the set of user items. | |
99 | + """ | |
100 | + return self.item_score.keys() | |
101 | + | |
83 | 102 | def set_demographic_profile(self,profiles_set): |
103 | + """ | |
104 | + Set demographic profle based on labels in 'profiles_set'. | |
105 | + """ | |
84 | 106 | self.demographic_profile = DemographicProfile()(profiles_set) |
85 | 107 | |
86 | - def items(self): | |
108 | + def profile(self,items_repository,content,size): | |
87 | 109 | """ |
88 | - Return the set of user items. | |
110 | + Get user profile for a specific type of content: packages tags, | |
111 | + description or both (full_profile) | |
112 | + """ | |
113 | + if content == "tag": return self.tag_profile(items_repository,size) | |
114 | + if content == "desc": return self.desc_profile(items_repository,size) | |
115 | + if content == "full": return self.full_profile(items_repository,size) | |
116 | + | |
117 | + def tag_profile(self,items_repository,size): | |
118 | + """ | |
119 | + Return most relevant tags for a list of packages. | |
89 | 120 | """ |
90 | - return set(self.item_score.keys()) | |
91 | - | |
92 | - def axi_tag_profile(self,apt_xapian_index,profile_size): | |
93 | - """ | |
94 | - Return most relevant tags for a list of packages based on axi. | |
95 | - """ | |
96 | - terms = ["XP"+item for item in self.pkg_profile] | |
97 | - query = xapian.Query(xapian.Query.OP_OR, terms) | |
98 | - enquire = xapian.Enquire(apt_xapian_index) | |
99 | - enquire.set_query(query) | |
100 | - rset = xapian.RSet() | |
101 | - for m in enquire.get_mset(0,apt_xapian_index.get_doccount()): | |
102 | - rset.add_document(m.docid) | |
103 | - # statistically good differentiators between relevant and non-relevant | |
104 | - eset = enquire.get_eset(profile_size, rset, FilterTag()) | |
105 | - profile = [] | |
106 | - for res in eset: | |
107 | - profile.append(res.term) | |
108 | - logging.debug("%.2f %s" % (res.weight,res.term.lstrip("XT"))) | |
121 | + enquire = xapian.Enquire(items_repository) | |
122 | + matches = data.axi_search_pkgs(items_repository,self.pkg_profile) | |
123 | + rset_packages = xapian.RSet() | |
124 | + for m in matches: | |
125 | + rset_packages.add_document(m.docid) | |
126 | + # statistically good differentiators | |
127 | + eset_tags = enquire.get_eset(size, rset_packages, FilterTag()) | |
128 | + profile = [res.term for res in eset_tags] | |
109 | 129 | return profile |
110 | 130 | |
111 | - #def txi_tag_profile(self,tags_xapian_index,profile_size): | |
112 | - # """ | |
113 | - # Return most relevant tags for a list of packages based on tags index. | |
114 | - # """ | |
115 | - # return tags_xapian_index.relevant_tags_from_db(self.pkg_profile, | |
116 | - # profile_size) | |
131 | + def desc_profile(self,items_repository,size): | |
132 | + """ | |
133 | + Return most relevant keywords for a list of packages based on their | |
134 | + text descriptions. | |
135 | + """ | |
136 | + enquire = xapian.Enquire(items_repository) | |
137 | + matches = data.axi_search_pkgs(items_repository,self.pkg_profile) | |
138 | + rset_packages = xapian.RSet() | |
139 | + for m in matches: | |
140 | + rset_packages.add_document(m.docid) | |
141 | + eset_keywords = enquire.get_eset(size, rset_packages, | |
142 | + FilterDescription()) | |
143 | + profile = [res.term for res in eset_keywords] | |
144 | + return profile | |
145 | + | |
146 | + def full_profile(self,items_repository,size): | |
147 | + """ | |
148 | + Return most relevant tags and keywords for a list of packages based | |
149 | + their tags and descriptions. | |
150 | + """ | |
151 | + tag_profile = self.tag_profile(items_repository,size)[:size/2] | |
152 | + desc_profile = self.desc_profile(items_repository,size)[:size/2] | |
153 | + return tag_profile+desc_profile | |
117 | 154 | |
118 | 155 | def maximal_pkg_profile(self): |
119 | 156 | """ |
... | ... | @@ -137,7 +174,7 @@ class User: |
137 | 174 | profile_size = len(self.pkg_profile) |
138 | 175 | logging.info("Reduced packages profile size from %d to %d." % |
139 | 176 | (old_profile_size, profile_size)) |
140 | - return set(self.pkg_profile) | |
177 | + return self.pkg_profile | |
141 | 178 | |
142 | 179 | class LocalSystem(User): |
143 | 180 | """ | ... | ... |