Commit 9e2461efb830d0a400f46053db69b405b4b14aa4

Authored by Tássia Camões Araújo
1 parent a69758a8
Exists in master and in 1 other branch add_vagrant

Disregarding TagsXapianIndex for now due to performance issues; Demographic profiles implementation (no strategies yet); Minor bug fixes.
src/app_recommender.py
... ... @@ -28,7 +28,7 @@ from datetime import timedelta
28 28 from config import *
29 29 from data import *
30 30 from evaluation import *
31   -from similarity import *
  31 +from dissimilarity import *
32 32 from recommender import *
33 33 from strategy import *
34 34 from user import *
... ...
src/clustering.py
... ... @@ -26,7 +26,7 @@ from datetime import timedelta
26 26  
27 27 from config import *
28 28 from data import *
29   -from similarity import *
  29 +from dissimilarity import *
30 30 from error import Error
31 31  
32 32 if __name__ == '__main__':
... ...
src/cross_validation.py
... ... @@ -29,7 +29,7 @@ from datetime import timedelta
29 29 from config import *
30 30 from data import *
31 31 from evaluation import *
32   -from similarity import *
  32 +from dissimilarity import *
33 33 from recommender import *
34 34 from strategy import *
35 35 from user import *
... ...
src/data.py
... ... @@ -28,36 +28,38 @@ import axi
28 28 from debian import debtags
29 29 import logging
30 30 import hashlib
  31 +import random
31 32  
32 33 from error import Error
33 34 from singleton import Singleton
34 35 import cluster
35   -from similarity import *
36   -
37   -class Item:
38   - """
39   - Generic item definition.
40   - """
41   -
42   -class Package(Item):
43   - """
44   - Definition of a GNU/Linux application as a recommender item.
45   - """
46   - def __init__(self,package_name):
47   - """
48   - Set initial attributes.
49   - """
50   - self.package_name = package_name
51   -
52   -def normalize_tags(string):
53   - """
54   - Substitute string characters : by _ and - by '.
55   - Examples:
56   - admin::package-management -> admin__package'management
57   - implemented-in::c++ -> implemented-in__c++
58   - """
59   - return string.replace(':','_').replace('-','\'')
60   -
  36 +from dissimilarity import *
  37 +
  38 +#class Item:
  39 +# """
  40 +# Generic item definition.
  41 +# """
  42 +#
  43 +#class Package(Item):
  44 +# """
  45 +# Definition of a GNU/Linux application as a recommender item.
  46 +# """
  47 +# def __init__(self,package_name):
  48 +# """
  49 +# Set initial attributes.
  50 +# """
  51 +# self.package_name = package_name
  52 +#
  53 +#def normalize_tags(string):
  54 +# """
  55 +# Substitute string characters : by _ and - by '.
  56 +# Examples:
  57 +# admin::package-management -> admin__package'management
  58 +# implemented-in::c++ -> implemented-in__c++
  59 +# """
  60 +# return string.replace(':','_').replace('-','\'')
  61 +
  62 +#[FIXME] get pkg tags from axi and remove load_debtags_db method
61 63 def load_debtags_db(db_path):
62 64 """
63 65 Load debtags database from the source file.
... ... @@ -73,105 +75,105 @@ def load_debtags_db(db_path):
73 75 logging.error("Could not load DebtagsDB from '%s'." % self.db_path)
74 76 raise Error
75 77  
76   -class TagsXapianIndex(xapian.WritableDatabase,Singleton):
77   - """
78   - Data source for tags info defined as a singleton xapian database.
79   - """
80   - def __init__(self,cfg):
81   - """
82   - Set initial attributes.
83   - """
84   - self.path = os.path.expanduser(cfg.tags_index)
85   - self.db_path = os.path.expanduser(cfg.tags_db)
86   - self.debtags_db = debtags.DB()
87   -
88   - try:
89   - db_file = open(self.db_path)
90   - except IOError:
91   - logging.error("Could not load DebtagsDB from '%s'." % self.db_path)
92   - raise Error
93   - md5 = hashlib.md5()
94   - md5.update(db_file.read())
95   - self.db_md5 = md5.hexdigest()
96   - db_file.close()
97   - self.load_index(cfg.reindex)
98   -
99   -# def load_db(self):
  78 +#class TagsXapianIndex(xapian.WritableDatabase,Singleton):
  79 +# """
  80 +# Data source for tags info defined as a singleton xapian database.
  81 +# """
  82 +# def __init__(self,cfg):
100 83 # """
101   -# Load debtags database from the source file.
  84 +# Set initial attributes.
102 85 # """
103   -# tag_filter = re.compile(r"^special::.+$|^.+::TODO$")
  86 +# self.path = os.path.expanduser(cfg.tags_index)
  87 +# self.db_path = os.path.expanduser(cfg.tags_db)
  88 +# self.debtags_db = debtags.DB()
104 89 # try:
105   -# db_file = open(self.db_path, "r")
106   -# self.debtags_db.read(db_file,lambda x: not tag_filter.match(x))
107   -# db_file.close()
108   -# except:
  90 +# db_file = open(self.db_path)
  91 +# except IOError:
109 92 # logging.error("Could not load DebtagsDB from '%s'." % self.db_path)
110 93 # raise Error
111   -
112   - def relevant_tags_from_db(self,pkgs_list,qtd_of_tags):
113   - """
114   - Return most relevant tags considering a list of packages.
115   - """
116   - if not self.debtags_db.package_count():
117   - self.debtags_db = load_debtags_db(self.db_path)
118   - relevant_db = self.debtags_db.choose_packages(pkgs_list)
119   - relevance_index = debtags.relevance_index_function(self.debtags_db,
120   - relevant_db)
121   - sorted_relevant_tags = sorted(relevant_db.iter_tags(),
122   - lambda a, b: cmp(relevance_index(a),
123   - relevance_index(b)))
124   - return normalize_tags(' '.join(sorted_relevant_tags[-qtd_of_tags:]))
125   -
126   - def load_index(self,reindex):
127   - """
128   - Load an existing debtags index.
129   - """
130   - if not reindex:
131   - try:
132   - logging.info("Opening existing debtags xapian index at \'%s\'"
133   - % self.path)
134   - xapian.Database.__init__(self,self.path)
135   - md5 = self.get_metadata("md5")
136   - if not md5 == self.db_md5:
137   - logging.info("Index must be updated.")
138   - reindex = 1
139   - except xapian.DatabaseError:
140   - logging.info("Could not open debtags index.")
141   - reindex =1
142   -
143   - if reindex:
144   - self.new_index()
145   -
146   - def new_index(self):
147   - """
148   - Create a xapian index for debtags info based on 'debtags_db' and
149   - place it at 'self.path'.
150   - """
151   - if not os.path.exists(self.path):
152   - os.makedirs(self.path)
153   -
154   - try:
155   - logging.info("Indexing debtags info from \'%s\'" %
156   - self.db_path)
157   - logging.info("Creating new xapian index at \'%s\'" %
158   - self.path)
159   - xapian.WritableDatabase.__init__(self,self.path,
160   - xapian.DB_CREATE_OR_OVERWRITE)
161   - except xapian.DatabaseError:
162   - logging.critical("Could not create xapian index.")
163   - raise Error
164   -
165   - self.debtags_db = load_debtags_db(self.db_path)
166   - self.set_metadata("md5",self.db_md5)
167   -
168   - for pkg,tags in self.debtags_db.iter_packages_tags():
169   - doc = xapian.Document()
170   - doc.set_data(pkg)
171   - for tag in tags:
172   - doc.add_term(normalize_tags(tag))
173   - doc_id = self.add_document(doc)
174   - logging.debug("Debtags Xapian: Indexing doc %d",doc_id)
  94 +# md5 = hashlib.md5()
  95 +# md5.update(db_file.read())
  96 +# self.db_md5 = md5.hexdigest()
  97 +# db_file.close()
  98 +# self.load_index(cfg.reindex)
  99 +#
  100 +## def load_db(self):
  101 +## """
  102 +## Load debtags database from the source file.
  103 +## """
  104 +## tag_filter = re.compile(r"^special::.+$|^.+::TODO$")
  105 +## try:
  106 +## db_file = open(self.db_path, "r")
  107 +## self.debtags_db.read(db_file,lambda x: not tag_filter.match(x))
  108 +## db_file.close()
  109 +## except:
  110 +## logging.error("Could not load DebtagsDB from '%s'." % self.db_path)
  111 +## raise Error
  112 +#
  113 +# def relevant_tags_from_db(self,pkgs_list,qtd_of_tags):
  114 +# """
  115 +# Return most relevant tags considering a list of packages.
  116 +# """
  117 +# if not self.debtags_db.package_count():
  118 +# #print "index vazio"
  119 +# self.debtags_db = load_debtags_db(self.db_path)
  120 +# relevant_db = self.debtags_db.choose_packages(pkgs_list)
  121 +# relevance_index = debtags.relevance_index_function(self.debtags_db,
  122 +# relevant_db)
  123 +# sorted_relevant_tags = sorted(relevant_db.iter_tags(),
  124 +# lambda a, b: cmp(relevance_index(a),
  125 +# relevance_index(b)))
  126 +# return normalize_tags(' '.join(sorted_relevant_tags[-qtd_of_tags:]))
  127 +#
  128 +# def load_index(self,reindex):
  129 +# """
  130 +# Load an existing debtags index.
  131 +# """
  132 +# if not reindex:
  133 +# try:
  134 +# logging.info("Opening existing debtags xapian index at \'%s\'"
  135 +# % self.path)
  136 +# xapian.Database.__init__(self,self.path)
  137 +# md5 = self.get_metadata("md5")
  138 +# if not md5 == self.db_md5:
  139 +# logging.info("Index must be updated.")
  140 +# reindex = 1
  141 +# except xapian.DatabaseError:
  142 +# logging.info("Could not open debtags index.")
  143 +# reindex =1
  144 +#
  145 +# if reindex:
  146 +# self.new_index()
  147 +#
  148 +# def new_index(self):
  149 +# """
  150 +# Create a xapian index for debtags info based on 'debtags_db' and
  151 +# place it at 'self.path'.
  152 +# """
  153 +# if not os.path.exists(self.path):
  154 +# os.makedirs(self.path)
  155 +#
  156 +# try:
  157 +# logging.info("Indexing debtags info from \'%s\'" %
  158 +# self.db_path)
  159 +# logging.info("Creating new xapian index at \'%s\'" %
  160 +# self.path)
  161 +# xapian.WritableDatabase.__init__(self,self.path,
  162 +# xapian.DB_CREATE_OR_OVERWRITE)
  163 +# except xapian.DatabaseError:
  164 +# logging.critical("Could not create xapian index.")
  165 +# raise Error
  166 +#
  167 +# self.debtags_db = load_debtags_db(self.db_path)
  168 +# self.set_metadata("md5",self.db_md5)
  169 +#
  170 +# for pkg,tags in self.debtags_db.iter_packages_tags():
  171 +# doc = xapian.Document()
  172 +# doc.set_data(pkg)
  173 +# for tag in tags:
  174 +# doc.add_term(normalize_tags(tag))
  175 +# doc_id = self.add_document(doc)
  176 +# logging.debug("Debtags Xapian: Indexing doc %d",doc_id)
175 177  
176 178 class PopconXapianIndex(xapian.WritableDatabase,Singleton):
177 179 """
... ... @@ -232,7 +234,7 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton):
232 234 """
233 235 if not os.path.exists(self.path):
234 236 os.makedirs(self.path)
235   - debtags_db = load_debtags_db(self.debtags_path)
  237 + debtags_db = load_debtags_db(self.debtags_path) #[FIXME]
236 238  
237 239 try:
238 240 logging.info("Indexing popcon submissions from \'%s\'" %
... ... @@ -254,6 +256,7 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton):
254 256 submission_path)
255 257 for pkg, freq in self.parse_submission(submission_path):
256 258 doc.add_term(pkg,freq)
  259 + #[FIXME] get tags from axi
257 260 for tag in debtags_db.tags_of_package(pkg):
258 261 doc.add_term("XT"+tag,freq)
259 262 doc_id = self.add_document(doc)
... ... @@ -334,22 +337,27 @@ class PopconClusteredData(Singleton):
334 337 s.add_pkg(pkg)
335 338 self.submissions.append(s)
336 339  
337   - distanceFunction = JaccardIndex()
338   - cl = cluster.HierarchicalClustering(self.submissions,lambda x,y: distanceFunction(x.pkgs_list,y.pkgs_list))
339   - clusters = cl.getlevel(0.5)
340   - for c in clusters:
341   - print "cluster"
342   - for submission in c:
343   - print submission.hash
344   - #cl = KMeansClusteringPopcon(self.submissions,
345   - # lambda x,y: distanceFunction(x.pkgs_list,y.pkgs_list))
  340 + distanceFunction = JaccardDistance()
  341 + # cl = cluster.HierarchicalClustering(self.submissions,lambda x,y: distanceFunction(x.pkgs_list,y.pkgs_list))
  342 + # clusters = cl.getlevel(0.5)
  343 + # for c in clusters:
  344 + # print "cluster"
  345 + # for submission in c:
  346 + # print submission.hash
  347 + cl = KMedoidsClusteringPopcon(self.submissions, lambda x,y: \
  348 + distanceFunction(x.pkgs_list,y.pkgs_list))
346 349 #clusters = cl.getclusters(2)
347   - #medoids = cl.getMedoids(2)
  350 + medoids = cl.getMedoids(2)
  351 + print "medoids"
  352 + for m in medoids:
  353 + print m.hash
348 354  
349 355 class KMedoidsClusteringPopcon(cluster.KMeansClustering):
350 356  
351 357 def __init__(self,data,distance):
352   - cluster.KMeansClustering.__init__(self, data, distance)
  358 + if len(data)>100:
  359 + data_sample = random.sample(data,100)
  360 + cluster.KMeansClustering.__init__(self, data_sample, distance)
353 361 self.distanceMatrix = {}
354 362 for submission in self._KMeansClustering__data:
355 363 self.distanceMatrix[submission.hash] = {}
... ... @@ -377,7 +385,7 @@ class KMedoidsClusteringPopcon(cluster.KMeansClustering):
377 385 for i in range(len(cluster)):
378 386 totalDistance = sum(self.distanceMatrix[cluster[i].hash].values())
379 387 print "totalDistance[",i,"]=",totalDistance
380   - if totalDistance < centroidDistance:
  388 + if totalDistance < medoidDistance:
381 389 medoidDistance = totalDistance
382 390 medoid = i
383 391 print "medoidDistance:",medoidDistance
... ...
src/dissimilarity.py
1 1 #!/usr/bin/env python
2 2 """
3   - similarity - python module for classes and methods related to similarity
4   - measuring between two sets of data.
  3 + dissimilarity - python module for classes and methods related to
  4 + dissimilarity (distance) measuring between two sets of data.
5 5 """
6 6 __author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
7 7 __copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
... ...
src/strategy.py
... ... @@ -144,30 +144,30 @@ class ItemReputationStrategy(RecommendationStrategy):
144 144 logging.critical("Item reputation recommendation strategy is not yet implemented.")
145 145 raise Error
146 146  
147   -class ContentBasedStrategy(RecommendationStrategy):
148   - """
149   - Content-based recommendation strategy.
150   - """
151   - def run(self,rec,user):
152   - """
153   - Perform recommendation strategy.
154   - """
155   - profile = user.txi_tag_profile(rec.items_repository,50)
156   - qp = xapian.QueryParser()
157   - query = qp.parse_query(profile)
158   - enquire = xapian.Enquire(rec.items_repository)
159   - enquire.set_query(query)
160   -
161   - try:
162   - mset = enquire.get_mset(0, 20, None, PkgMatchDecider(user.items()))
163   - except xapian.DatabaseError as error:
164   - logging.critical(error.get_msg())
165   - raise Error
166   -
167   - item_score = {}
168   - for m in mset:
169   - item_score[m.document.get_data()] = m.rank
170   - return recommender.RecommendationResult(item_score,20)
  147 +#class ContentBasedStrategy(RecommendationStrategy):
  148 +# """
  149 +# Content-based recommendation strategy.
  150 +# """
  151 +# def run(self,rec,user):
  152 +# """
  153 +# Perform recommendation strategy.
  154 +# """
  155 +# profile = user.txi_tag_profile(rec.items_repository,50)
  156 +# qp = xapian.QueryParser()
  157 +# query = qp.parse_query(profile)
  158 +# enquire = xapian.Enquire(rec.items_repository)
  159 +# enquire.set_query(query)
  160 +#
  161 +# try:
  162 +# mset = enquire.get_mset(0, 20, None, PkgMatchDecider(user.items()))
  163 +# except xapian.DatabaseError as error:
  164 +# logging.critical(error.get_msg())
  165 +# raise Error
  166 +#
  167 +# item_score = {}
  168 +# for m in mset:
  169 +# item_score[m.document.get_data()] = m.rank
  170 +# return recommender.RecommendationResult(item_score,20)
171 171  
172 172 class AxiContentBasedStrategy(RecommendationStrategy):
173 173 """
... ...
src/user.py
... ... @@ -19,10 +19,12 @@ __license__ = """
19 19 along with this program. If not, see <http://www.gnu.org/licenses/>.
20 20 """
21 21  
  22 +import random
22 23 import commands
23 24 import xapian
24 25 import logging
25 26 import apt
  27 +from singleton import Singleton
26 28  
27 29 class FilterTag(xapian.ExpandDecider):
28 30 """
... ... @@ -34,51 +36,84 @@ class FilterTag(xapian.ExpandDecider):
34 36 """
35 37 return term[:2] == "XT"
36 38  
  39 +class DemographicProfile(Singleton):
  40 + def __init__(self):
  41 + self.admin = set(["admin", "hardware", "mail", "protocol",
  42 + "network", "security", "web", "interface::web"])
  43 + self.devel = set(["devel", "role::devel-lib", "role::shared-lib"])
  44 + self.desktop = set(["x11", "accessibility", "game", "junior", "office",
  45 + "interface::x11"])
  46 + self.art = set(["field::arts", "sound"])
  47 + self.science = set(["science", "biology", "field::astronomy",
  48 + "field::aviation", "field::biology",
  49 + "field::chemistry", "field::eletronics",
  50 + "field::finance", "field::geography",
  51 + "field::geology", "field::linguistics",
  52 + "field::mathematics", "field::medicine",
  53 + "field::meteorology", "field::physics",
  54 + "field::statistics"])
  55 +
  56 + def __call__(self,profiles_set):
  57 + demographic_profile = set()
  58 + for profile in profiles_set:
  59 + demographic_profile = (demographic_profile | eval("self."+profile,{},{"self":self}))
  60 + return demographic_profile
  61 +
37 62 class User:
38 63 """
39 64 Define a user of a recommender.
40 65 """
41   - def __init__(self,item_score,user_id=0,demographic_profile=0):
  66 + def __init__(self,item_score,user_id=0,profiles_set=0):
42 67 """
43   - Set initial parameters.
  68 + Set initial user attributes. If no user_id was passed as parameter, a
  69 + random 128-bit identifier is generated for that purpose. If the
  70 + demographic profile was not defined, it defaults to 'desktop'.
44 71 """
45   - self.id = user_id
46 72 self.item_score = item_score
  73 + if user_id:
  74 + self.id = user_id
  75 + else:
  76 + random.seed()
  77 + self.id = random.getrandbits(128)
47 78 self.pkg_profile = self.item_score.keys()
48   - self.demographic_profile = demographic_profile
  79 + if not profiles_set:
  80 + profiles_set = set(["desktop"])
  81 + self.set_demographic_profile(profiles_set)
  82 +
  83 + def set_demographic_profile(self,profiles_set):
  84 + self.demographic_profile = DemographicProfile()(profiles_set)
49 85  
50 86 def items(self):
51 87 """
52   - Return dictionary relating items and repective scores.
  88 + Return the set of user items.
53 89 """
54   - return self.item_score.keys()
  90 + return set(self.item_score.keys())
55 91  
56 92 def axi_tag_profile(self,apt_xapian_index,profile_size):
57 93 """
58 94 Return most relevant tags for a list of packages based on axi.
59 95 """
60   - terms = []
61   - for item in self.pkg_profile:
62   - terms.append("XP"+item)
  96 + terms = ["XP"+item for item in self.pkg_profile]
63 97 query = xapian.Query(xapian.Query.OP_OR, terms)
64 98 enquire = xapian.Enquire(apt_xapian_index)
65 99 enquire.set_query(query)
66 100 rset = xapian.RSet()
67   - for m in enquire.get_mset(0,30000): #consider all matches
  101 + for m in enquire.get_mset(0,apt_xapian_index.get_doccount()):
68 102 rset.add_document(m.docid)
  103 + # statistically good differentiators between relevant and non-relevant
69 104 eset = enquire.get_eset(profile_size, rset, FilterTag())
70 105 profile = []
71 106 for res in eset:
72 107 profile.append(res.term)
73   - logging.debug("%.2f %s" % (res.weight,res.term[2:]))
  108 + logging.debug("%.2f %s" % (res.weight,res.term.lstrip("XT")))
74 109 return profile
75 110  
76   - def txi_tag_profile(self,tags_xapian_index,profile_size):
77   - """
78   - Return most relevant tags for a list of packages based on tags index.
79   - """
80   - return tags_xapian_index.relevant_tags_from_db(self.pkg_profile,
81   - profile_size)
  111 + #def txi_tag_profile(self,tags_xapian_index,profile_size):
  112 + # """
  113 + # Return most relevant tags for a list of packages based on tags index.
  114 + # """
  115 + # return tags_xapian_index.relevant_tags_from_db(self.pkg_profile,
  116 + # profile_size)
82 117  
83 118 def maximal_pkg_profile(self):
84 119 """
... ...