Commit 8503780dbddd8c15b311ddf503b63794923e39cf

Authored by Tássia Camões Araújo
1 parent b2ea1ba9
Exists in master and in 1 other branch add_vagrant

data.py: implemented new class SampleAptXapianIndex to serve as a data source

for tests and new search functions to retrieve packages and tags directly from apt-xapian-index;
user.py: new user profile methods are based on the kind of content: tag,
description or both;
Implemented more tests for user.py and removed deprecated classes and
functions that used debtags_db from the source code;
Showing 3 changed files with 143 additions and 188 deletions   Show diff stats
src/data.py
... ... @@ -35,29 +35,44 @@ from singleton import Singleton
35 35 import cluster
36 36 from dissimilarity import *
37 37  
38   -#class Item:
39   -# """
40   -# Generic item definition.
41   -# """
42   -#
43   -#class Package(Item):
44   -# """
45   -# Definition of a GNU/Linux application as a recommender item.
46   -# """
47   -# def __init__(self,package_name):
48   -# """
49   -# Set initial attributes.
50   -# """
51   -# self.package_name = package_name
52   -#
53   -#def normalize_tags(string):
54   -# """
55   -# Substitute string characters : by _ and - by '.
56   -# Examples:
57   -# admin::package-management -> admin__package'management
58   -# implemented-in::c++ -> implemented-in__c++
59   -# """
60   -# return string.replace(':','_').replace('-','\'')
  38 +def axi_search_pkgs(axi,pkgs_list):
  39 + terms = ["XP"+item for item in pkgs_list]
  40 + query = xapian.Query(xapian.Query.OP_OR, terms)
  41 + enquire = xapian.Enquire(axi)
  42 + enquire.set_query(query)
  43 + matches = enquire.get_mset(0,axi.get_doccount())
  44 + return matches
  45 +
  46 +def axi_search_pkg_tags(axi,pkg):
  47 + query = xapian.Query(xapian.Query.OP_OR, "XP"+pkg)
  48 + enquire = xapian.Enquire(axi)
  49 + enquire.set_query(query)
  50 + matches = enquire.get_mset(0,1)
  51 + for m in matches:
  52 + tags = [term.term for term in axi.get_document(m.docid).termlist() if
  53 + term.term.startswith("XT")]
  54 + return tags
  55 +
  56 +class SampleAptXapianIndex(xapian.WritableDatabase):
  57 + """
  58 + Sample data source for packages information, mainly useful for tests.
  59 + """
  60 + def __init__(self,pkgs_list,axi):
  61 + xapian.WritableDatabase.__init__(self,".sample_axi",
  62 + xapian.DB_CREATE_OR_OVERWRITE)
  63 + sample = axi_search_pkgs(axi,pkgs_list)
  64 + self.all_docs = []
  65 + for package in sample:
  66 + doc_id = self.add_document(axi.get_document(package.docid))
  67 + self.all_docs.append(doc_id)
  68 +
  69 + def _print(self):
  70 + print "---"
  71 + print xapian.WritableDatabase.__repr__(self)
  72 + print "---"
  73 + for doc_id in self.all_docs:
  74 + print [term.term for term in self.get_document(doc_id).termlist()]
  75 + print "---"
61 76  
62 77 #[FIXME] get pkg tags from axi and remove load_debtags_db method
63 78 def load_debtags_db(db_path):
... ... @@ -75,106 +90,6 @@ def load_debtags_db(db_path):
75 90 logging.error("Could not load DebtagsDB from '%s'." % self.db_path)
76 91 raise Error
77 92  
78   -#class TagsXapianIndex(xapian.WritableDatabase,Singleton):
79   -# """
80   -# Data source for tags info defined as a singleton xapian database.
81   -# """
82   -# def __init__(self,cfg):
83   -# """
84   -# Set initial attributes.
85   -# """
86   -# self.path = os.path.expanduser(cfg.tags_index)
87   -# self.db_path = os.path.expanduser(cfg.tags_db)
88   -# self.debtags_db = debtags.DB()
89   -# try:
90   -# db_file = open(self.db_path)
91   -# except IOError:
92   -# logging.error("Could not load DebtagsDB from '%s'." % self.db_path)
93   -# raise Error
94   -# md5 = hashlib.md5()
95   -# md5.update(db_file.read())
96   -# self.db_md5 = md5.hexdigest()
97   -# db_file.close()
98   -# self.load_index(cfg.reindex)
99   -#
100   -## def load_db(self):
101   -## """
102   -## Load debtags database from the source file.
103   -## """
104   -## tag_filter = re.compile(r"^special::.+$|^.+::TODO$")
105   -## try:
106   -## db_file = open(self.db_path, "r")
107   -## self.debtags_db.read(db_file,lambda x: not tag_filter.match(x))
108   -## db_file.close()
109   -## except:
110   -## logging.error("Could not load DebtagsDB from '%s'." % self.db_path)
111   -## raise Error
112   -#
113   -# def relevant_tags_from_db(self,pkgs_list,qtd_of_tags):
114   -# """
115   -# Return most relevant tags considering a list of packages.
116   -# """
117   -# if not self.debtags_db.package_count():
118   -# #print "index vazio"
119   -# self.debtags_db = load_debtags_db(self.db_path)
120   -# relevant_db = self.debtags_db.choose_packages(pkgs_list)
121   -# relevance_index = debtags.relevance_index_function(self.debtags_db,
122   -# relevant_db)
123   -# sorted_relevant_tags = sorted(relevant_db.iter_tags(),
124   -# lambda a, b: cmp(relevance_index(a),
125   -# relevance_index(b)))
126   -# return normalize_tags(' '.join(sorted_relevant_tags[-qtd_of_tags:]))
127   -#
128   -# def load_index(self,reindex):
129   -# """
130   -# Load an existing debtags index.
131   -# """
132   -# if not reindex:
133   -# try:
134   -# logging.info("Opening existing debtags xapian index at \'%s\'"
135   -# % self.path)
136   -# xapian.Database.__init__(self,self.path)
137   -# md5 = self.get_metadata("md5")
138   -# if not md5 == self.db_md5:
139   -# logging.info("Index must be updated.")
140   -# reindex = 1
141   -# except xapian.DatabaseError:
142   -# logging.info("Could not open debtags index.")
143   -# reindex =1
144   -#
145   -# if reindex:
146   -# self.new_index()
147   -#
148   -# def new_index(self):
149   -# """
150   -# Create a xapian index for debtags info based on 'debtags_db' and
151   -# place it at 'self.path'.
152   -# """
153   -# if not os.path.exists(self.path):
154   -# os.makedirs(self.path)
155   -#
156   -# try:
157   -# logging.info("Indexing debtags info from \'%s\'" %
158   -# self.db_path)
159   -# logging.info("Creating new xapian index at \'%s\'" %
160   -# self.path)
161   -# xapian.WritableDatabase.__init__(self,self.path,
162   -# xapian.DB_CREATE_OR_OVERWRITE)
163   -# except xapian.DatabaseError:
164   -# logging.critical("Could not create xapian index.")
165   -# raise Error
166   -#
167   -# self.debtags_db = load_debtags_db(self.db_path)
168   -# self.set_metadata("md5",self.db_md5)
169   -#
170   -# for pkg,tags in self.debtags_db.iter_packages_tags():
171   -# doc = xapian.Document()
172   -# doc.set_data(pkg)
173   -# for tag in tags:
174   -# doc.add_term(normalize_tags(tag))
175   -# doc_id = self.add_document(doc)
176   -# logging.debug("Debtags Xapian: Indexing doc %d",doc_id)
177   -
178 93 class PopconXapianIndex(xapian.WritableDatabase,Singleton):
179 94 """
180 95 Data source for popcon submissions defined as a singleton xapian database.
... ...
src/tests/user_tests.py
... ... @@ -36,9 +36,12 @@ class UserTests(unittest2.TestCase):
36 36 @classmethod
37 37 def setUpClass(self):
38 38 cfg = Config()
39   - #self.axi = xapian.Database(cfg.axi)
  39 + self.axi = xapian.Database(cfg.axi)
  40 + sample_packages = ["gimp","aaphoto","eog","emacs","dia","ferret",
  41 + "festival","file","inkscape","xpdf"]
  42 + self.sample_axi = SampleAptXapianIndex(sample_packages,self.axi)
40 43 self.user = User({"gimp":1,"aaphoto":1,"eog":1,"emacs":1})
41   - self.pxi = PkgXapianIndex("package-xapian-index")
  44 + #self.sample_axi._print()
42 45  
43 46 def test_hash(self):
44 47 new_user = User(dict())
... ... @@ -100,34 +103,34 @@ class UserTests(unittest2.TestCase):
100 103 self.assertEqual(self.user.demographic_profile,desktop_art_admin)
101 104  
102 105 def test_items(self):
103   - self.assertEqual(self.user.items(),set(["gimp","aaphoto","eog","emacs"]))
104   -
105   - def test_axi_tag_profile(self):
106   - package_terms = ["XP"+package for package in self.user.items()]
107   - enquire = xapian.Enquire(self.pxi)
108   - enquire.set_query(xapian.Query(xapian.Query.OP_OR,package_terms))
109   - user_packages = enquire.get_mset(0, self.pxi.get_doccount(), None, None)
110   - tag_terms = []
111   - for p in user_packages:
112   - tag_terms = tag_terms + [x.term for x in p.document.termlist() \
113   - if x.term.startswith("XT")]
114   - relevant_count = dict([(tag,tag_terms.count(tag)) \
115   - for tag in set(tag_terms)])
116   - #rank = {}
117   - #non_relevant_count = dict()
118   - #for tag,count in relevant_count.items():
119   - # non_relevant_count[tag] = self.pxi.get_termfreq(tag)-count
120   - # if non_relevant_count[tag]>0:
121   - # rank[tag] = relevant_count[tag]/float(non_relevant_count[tag])
122   - #print "relevant",relevant_count
123   - #print "non_relevant",non_relevant_count
124   - #print sorted(rank.items(), key=operator.itemgetter(1))
125   - #[FIXME] get ths value based on real ranking
126   - #print set(self.user.axi_tag_profile(self.pxi,4))
127   - self.assertEqual(set(self.user.axi_tag_profile(self.pxi,4)),
128   - set(["XTuse::editing", "XTworks-with::image",
129   - "XTworks-with-format::png",
130   - "XTworks-with-format::jpg"]))
  106 + self.assertEqual(set(self.user.items()),
  107 + set(["gimp","aaphoto","eog","emacs"]))
  108 +
  109 + def test_profile(self):
  110 + self.assertEqual(self.user.profile(self.sample_axi,"tag",10),
  111 + self.user.tag_profile(self.sample_axi,10))
  112 + self.assertEqual(self.user.profile(self.sample_axi,"desc",10),
  113 + self.user.desc_profile(self.sample_axi,10))
  114 + self.assertEqual(self.user.profile(self.sample_axi,"full",10),
  115 + self.user.full_profile(self.sample_axi,10))
  116 +
  117 + def test_tag_profile(self):
  118 + self.assertEqual(self.user.tag_profile(self.sample_axi,10),
  119 + ['XTuse::editing', 'XTworks-with::image:raster',
  120 + 'XTworks-with-format::png', 'XTworks-with-format::jpg',
  121 + 'XTworks-with::image','XTimplemented-in::c',
  122 + 'XTsuite::gnome', 'XTsuite::emacs',
  123 + 'XTrole::metapackage', 'XTdevel::editor'])
  124 +
  125 + def test_desc_profile(self):
  126 + self.assertEqual(self.user.desc_profile(self.sample_axi,10),
  127 + ['image', 'the', 'which', 'manipulation', 'program',
  128 + 'input', 'a', 'gnu', 'images', 'this'])
  129 +
  130 + def test_full_profile(self):
  131 + self.assertEqual(self.user.full_profile(self.sample_axi,10),
  132 + (self.user.tag_profile(self.sample_axi,5)+
  133 + self.user.desc_profile(self.sample_axi,5)))
131 134  
132 135 def test_maximal_pkg_profile(self):
133 136 old_pkg_profile = self.user.items()
... ...
src/user.py
... ... @@ -25,6 +25,7 @@ import xapian
25 25 import logging
26 26 import apt
27 27 from singleton import Singleton
  28 +import data
28 29  
29 30 class FilterTag(xapian.ExpandDecider):
30 31 """
... ... @@ -34,7 +35,17 @@ class FilterTag(xapian.ExpandDecider):
34 35 """
35 36 Return true if the term is a tag, else false.
36 37 """
37   - return term[:2] == "XT"
  38 + return term.startswith("XT")
  39 +
  40 +class FilterDescription(xapian.ExpandDecider):
  41 + """
  42 + Extend xapian.ExpandDecider to consider only package description terms.
  43 + """
  44 + def __call__(self, term):
  45 + """
  46 + Return true if the term is a tag, else false.
  47 + """
  48 + return (term.islower())
38 49  
39 50 class DemographicProfile(Singleton):
40 51 def __init__(self):
... ... @@ -63,57 +74,83 @@ class User:
63 74 """
64 75 Define a user of a recommender.
65 76 """
66   - def __init__(self,item_score,user_id=0,profiles_set=0):
  77 + def __init__(self,item_score,user_id=0,demo_profiles_set=0):
67 78 """
68   - Set initial user attributes. If no user_id was passed as parameter, a
69   - random md5-hash is generated for that purpose. If the demographic
70   - profile was not defined, it defaults to 'desktop'
  79 + Set initial user attributes. pkg_profile gets the whole set of items,
  80 + a random user_id is set if none was provided and the demographic
  81 + profile defaults to 'desktop'.
71 82 """
72 83 self.item_score = item_score
  84 + self.pkg_profile = self.items()
  85 +
73 86 if user_id:
74 87 self.id = user_id
75 88 else:
76 89 random.seed()
77 90 self.id = random.getrandbits(128)
78   - self.pkg_profile = self.item_score.keys()
79   - if not profiles_set:
  91 +
  92 + if not demo_profiles_set:
80 93 profiles_set = set(["desktop"])
81 94 self.set_demographic_profile(profiles_set)
82 95  
  96 + def items(self):
  97 + """
  98 + Return the set of user items.
  99 + """
  100 + return self.item_score.keys()
  101 +
83 102 def set_demographic_profile(self,profiles_set):
  103 + """
  104 + Set demographic profle based on labels in 'profiles_set'.
  105 + """
84 106 self.demographic_profile = DemographicProfile()(profiles_set)
85 107  
86   - def items(self):
  108 + def profile(self,items_repository,content,size):
87 109 """
88   - Return the set of user items.
  110 + Get user profile for a specific type of content: packages tags,
  111 + description or both (full_profile)
  112 + """
  113 + if content == "tag": return self.tag_profile(items_repository,size)
  114 + if content == "desc": return self.desc_profile(items_repository,size)
  115 + if content == "full": return self.full_profile(items_repository,size)
  116 +
  117 + def tag_profile(self,items_repository,size):
  118 + """
  119 + Return most relevant tags for a list of packages.
89 120 """
90   - return set(self.item_score.keys())
91   -
92   - def axi_tag_profile(self,apt_xapian_index,profile_size):
93   - """
94   - Return most relevant tags for a list of packages based on axi.
95   - """
96   - terms = ["XP"+item for item in self.pkg_profile]
97   - query = xapian.Query(xapian.Query.OP_OR, terms)
98   - enquire = xapian.Enquire(apt_xapian_index)
99   - enquire.set_query(query)
100   - rset = xapian.RSet()
101   - for m in enquire.get_mset(0,apt_xapian_index.get_doccount()):
102   - rset.add_document(m.docid)
103   - # statistically good differentiators between relevant and non-relevant
104   - eset = enquire.get_eset(profile_size, rset, FilterTag())
105   - profile = []
106   - for res in eset:
107   - profile.append(res.term)
108   - logging.debug("%.2f %s" % (res.weight,res.term.lstrip("XT")))
  121 + enquire = xapian.Enquire(items_repository)
  122 + matches = data.axi_search_pkgs(items_repository,self.pkg_profile)
  123 + rset_packages = xapian.RSet()
  124 + for m in matches:
  125 + rset_packages.add_document(m.docid)
  126 + # statistically good differentiators
  127 + eset_tags = enquire.get_eset(size, rset_packages, FilterTag())
  128 + profile = [res.term for res in eset_tags]
109 129 return profile
110 130  
111   - #def txi_tag_profile(self,tags_xapian_index,profile_size):
112   - # """
113   - # Return most relevant tags for a list of packages based on tags index.
114   - # """
115   - # return tags_xapian_index.relevant_tags_from_db(self.pkg_profile,
116   - # profile_size)
  131 + def desc_profile(self,items_repository,size):
  132 + """
  133 + Return most relevant keywords for a list of packages based on their
  134 + text descriptions.
  135 + """
  136 + enquire = xapian.Enquire(items_repository)
  137 + matches = data.axi_search_pkgs(items_repository,self.pkg_profile)
  138 + rset_packages = xapian.RSet()
  139 + for m in matches:
  140 + rset_packages.add_document(m.docid)
  141 + eset_keywords = enquire.get_eset(size, rset_packages,
  142 + FilterDescription())
  143 + profile = [res.term for res in eset_keywords]
  144 + return profile
  145 +
  146 + def full_profile(self,items_repository,size):
  147 + """
  148 + Return most relevant tags and keywords for a list of packages based
  149 + their tags and descriptions.
  150 + """
  151 + tag_profile = self.tag_profile(items_repository,size)[:size/2]
  152 + desc_profile = self.desc_profile(items_repository,size)[:size/2]
  153 + return tag_profile+desc_profile
117 154  
118 155 def maximal_pkg_profile(self):
119 156 """
... ... @@ -137,7 +174,7 @@ class User:
137 174 profile_size = len(self.pkg_profile)
138 175 logging.info("Reduced packages profile size from %d to %d." %
139 176 (old_profile_size, profile_size))
140   - return set(self.pkg_profile)
  177 + return self.pkg_profile
141 178  
142 179 class LocalSystem(User):
143 180 """
... ...