Commit 9e2461efb830d0a400f46053db69b405b4b14aa4
1 parent
a69758a8
Exists in
master
and in
1 other branch
Disregarding TagsXapianIndex for now due to performance issues; Demographic pr…
…ofiles implementation (no strategies yet); Minor bug fixes.
Showing
7 changed files
with
222 additions
and
179 deletions
Show diff stats
src/app_recommender.py
src/clustering.py
src/cross_validation.py
src/data.py
... | ... | @@ -28,36 +28,38 @@ import axi |
28 | 28 | from debian import debtags |
29 | 29 | import logging |
30 | 30 | import hashlib |
31 | +import random | |
31 | 32 | |
32 | 33 | from error import Error |
33 | 34 | from singleton import Singleton |
34 | 35 | import cluster |
35 | -from similarity import * | |
36 | - | |
37 | -class Item: | |
38 | - """ | |
39 | - Generic item definition. | |
40 | - """ | |
41 | - | |
42 | -class Package(Item): | |
43 | - """ | |
44 | - Definition of a GNU/Linux application as a recommender item. | |
45 | - """ | |
46 | - def __init__(self,package_name): | |
47 | - """ | |
48 | - Set initial attributes. | |
49 | - """ | |
50 | - self.package_name = package_name | |
51 | - | |
52 | -def normalize_tags(string): | |
53 | - """ | |
54 | - Substitute string characters : by _ and - by '. | |
55 | - Examples: | |
56 | - admin::package-management -> admin__package'management | |
57 | - implemented-in::c++ -> implemented-in__c++ | |
58 | - """ | |
59 | - return string.replace(':','_').replace('-','\'') | |
60 | - | |
36 | +from dissimilarity import * | |
37 | + | |
38 | +#class Item: | |
39 | +# """ | |
40 | +# Generic item definition. | |
41 | +# """ | |
42 | +# | |
43 | +#class Package(Item): | |
44 | +# """ | |
45 | +# Definition of a GNU/Linux application as a recommender item. | |
46 | +# """ | |
47 | +# def __init__(self,package_name): | |
48 | +# """ | |
49 | +# Set initial attributes. | |
50 | +# """ | |
51 | +# self.package_name = package_name | |
52 | +# | |
53 | +#def normalize_tags(string): | |
54 | +# """ | |
55 | +# Substitute string characters : by _ and - by '. | |
56 | +# Examples: | |
57 | +# admin::package-management -> admin__package'management | |
58 | +# implemented-in::c++ -> implemented-in__c++ | |
59 | +# """ | |
60 | +# return string.replace(':','_').replace('-','\'') | |
61 | + | |
62 | +#[FIXME] get pkg tags from axi and remove load_debtags_db method | |
61 | 63 | def load_debtags_db(db_path): |
62 | 64 | """ |
63 | 65 | Load debtags database from the source file. |
... | ... | @@ -73,105 +75,105 @@ def load_debtags_db(db_path): |
73 | 75 | logging.error("Could not load DebtagsDB from '%s'." % self.db_path) |
74 | 76 | raise Error |
75 | 77 | |
76 | -class TagsXapianIndex(xapian.WritableDatabase,Singleton): | |
77 | - """ | |
78 | - Data source for tags info defined as a singleton xapian database. | |
79 | - """ | |
80 | - def __init__(self,cfg): | |
81 | - """ | |
82 | - Set initial attributes. | |
83 | - """ | |
84 | - self.path = os.path.expanduser(cfg.tags_index) | |
85 | - self.db_path = os.path.expanduser(cfg.tags_db) | |
86 | - self.debtags_db = debtags.DB() | |
87 | - | |
88 | - try: | |
89 | - db_file = open(self.db_path) | |
90 | - except IOError: | |
91 | - logging.error("Could not load DebtagsDB from '%s'." % self.db_path) | |
92 | - raise Error | |
93 | - md5 = hashlib.md5() | |
94 | - md5.update(db_file.read()) | |
95 | - self.db_md5 = md5.hexdigest() | |
96 | - db_file.close() | |
97 | - self.load_index(cfg.reindex) | |
98 | - | |
99 | -# def load_db(self): | |
78 | +#class TagsXapianIndex(xapian.WritableDatabase,Singleton): | |
79 | +# """ | |
80 | +# Data source for tags info defined as a singleton xapian database. | |
81 | +# """ | |
82 | +# def __init__(self,cfg): | |
100 | 83 | # """ |
101 | -# Load debtags database from the source file. | |
84 | +# Set initial attributes. | |
102 | 85 | # """ |
103 | -# tag_filter = re.compile(r"^special::.+$|^.+::TODO$") | |
86 | +# self.path = os.path.expanduser(cfg.tags_index) | |
87 | +# self.db_path = os.path.expanduser(cfg.tags_db) | |
88 | +# self.debtags_db = debtags.DB() | |
104 | 89 | # try: |
105 | -# db_file = open(self.db_path, "r") | |
106 | -# self.debtags_db.read(db_file,lambda x: not tag_filter.match(x)) | |
107 | -# db_file.close() | |
108 | -# except: | |
90 | +# db_file = open(self.db_path) | |
91 | +# except IOError: | |
109 | 92 | # logging.error("Could not load DebtagsDB from '%s'." % self.db_path) |
110 | 93 | # raise Error |
111 | - | |
112 | - def relevant_tags_from_db(self,pkgs_list,qtd_of_tags): | |
113 | - """ | |
114 | - Return most relevant tags considering a list of packages. | |
115 | - """ | |
116 | - if not self.debtags_db.package_count(): | |
117 | - self.debtags_db = load_debtags_db(self.db_path) | |
118 | - relevant_db = self.debtags_db.choose_packages(pkgs_list) | |
119 | - relevance_index = debtags.relevance_index_function(self.debtags_db, | |
120 | - relevant_db) | |
121 | - sorted_relevant_tags = sorted(relevant_db.iter_tags(), | |
122 | - lambda a, b: cmp(relevance_index(a), | |
123 | - relevance_index(b))) | |
124 | - return normalize_tags(' '.join(sorted_relevant_tags[-qtd_of_tags:])) | |
125 | - | |
126 | - def load_index(self,reindex): | |
127 | - """ | |
128 | - Load an existing debtags index. | |
129 | - """ | |
130 | - if not reindex: | |
131 | - try: | |
132 | - logging.info("Opening existing debtags xapian index at \'%s\'" | |
133 | - % self.path) | |
134 | - xapian.Database.__init__(self,self.path) | |
135 | - md5 = self.get_metadata("md5") | |
136 | - if not md5 == self.db_md5: | |
137 | - logging.info("Index must be updated.") | |
138 | - reindex = 1 | |
139 | - except xapian.DatabaseError: | |
140 | - logging.info("Could not open debtags index.") | |
141 | - reindex =1 | |
142 | - | |
143 | - if reindex: | |
144 | - self.new_index() | |
145 | - | |
146 | - def new_index(self): | |
147 | - """ | |
148 | - Create a xapian index for debtags info based on 'debtags_db' and | |
149 | - place it at 'self.path'. | |
150 | - """ | |
151 | - if not os.path.exists(self.path): | |
152 | - os.makedirs(self.path) | |
153 | - | |
154 | - try: | |
155 | - logging.info("Indexing debtags info from \'%s\'" % | |
156 | - self.db_path) | |
157 | - logging.info("Creating new xapian index at \'%s\'" % | |
158 | - self.path) | |
159 | - xapian.WritableDatabase.__init__(self,self.path, | |
160 | - xapian.DB_CREATE_OR_OVERWRITE) | |
161 | - except xapian.DatabaseError: | |
162 | - logging.critical("Could not create xapian index.") | |
163 | - raise Error | |
164 | - | |
165 | - self.debtags_db = load_debtags_db(self.db_path) | |
166 | - self.set_metadata("md5",self.db_md5) | |
167 | - | |
168 | - for pkg,tags in self.debtags_db.iter_packages_tags(): | |
169 | - doc = xapian.Document() | |
170 | - doc.set_data(pkg) | |
171 | - for tag in tags: | |
172 | - doc.add_term(normalize_tags(tag)) | |
173 | - doc_id = self.add_document(doc) | |
174 | - logging.debug("Debtags Xapian: Indexing doc %d",doc_id) | |
94 | +# md5 = hashlib.md5() | |
95 | +# md5.update(db_file.read()) | |
96 | +# self.db_md5 = md5.hexdigest() | |
97 | +# db_file.close() | |
98 | +# self.load_index(cfg.reindex) | |
99 | +# | |
100 | +## def load_db(self): | |
101 | +## """ | |
102 | +## Load debtags database from the source file. | |
103 | +## """ | |
104 | +## tag_filter = re.compile(r"^special::.+$|^.+::TODO$") | |
105 | +## try: | |
106 | +## db_file = open(self.db_path, "r") | |
107 | +## self.debtags_db.read(db_file,lambda x: not tag_filter.match(x)) | |
108 | +## db_file.close() | |
109 | +## except: | |
110 | +## logging.error("Could not load DebtagsDB from '%s'." % self.db_path) | |
111 | +## raise Error | |
112 | +# | |
113 | +# def relevant_tags_from_db(self,pkgs_list,qtd_of_tags): | |
114 | +# """ | |
115 | +# Return most relevant tags considering a list of packages. | |
116 | +# """ | |
117 | +# if not self.debtags_db.package_count(): | |
118 | +# #print "index vazio" | |
119 | +# self.debtags_db = load_debtags_db(self.db_path) | |
120 | +# relevant_db = self.debtags_db.choose_packages(pkgs_list) | |
121 | +# relevance_index = debtags.relevance_index_function(self.debtags_db, | |
122 | +# relevant_db) | |
123 | +# sorted_relevant_tags = sorted(relevant_db.iter_tags(), | |
124 | +# lambda a, b: cmp(relevance_index(a), | |
125 | +# relevance_index(b))) | |
126 | +# return normalize_tags(' '.join(sorted_relevant_tags[-qtd_of_tags:])) | |
127 | +# | |
128 | +# def load_index(self,reindex): | |
129 | +# """ | |
130 | +# Load an existing debtags index. | |
131 | +# """ | |
132 | +# if not reindex: | |
133 | +# try: | |
134 | +# logging.info("Opening existing debtags xapian index at \'%s\'" | |
135 | +# % self.path) | |
136 | +# xapian.Database.__init__(self,self.path) | |
137 | +# md5 = self.get_metadata("md5") | |
138 | +# if not md5 == self.db_md5: | |
139 | +# logging.info("Index must be updated.") | |
140 | +# reindex = 1 | |
141 | +# except xapian.DatabaseError: | |
142 | +# logging.info("Could not open debtags index.") | |
143 | +# reindex =1 | |
144 | +# | |
145 | +# if reindex: | |
146 | +# self.new_index() | |
147 | +# | |
148 | +# def new_index(self): | |
149 | +# """ | |
150 | +# Create a xapian index for debtags info based on 'debtags_db' and | |
151 | +# place it at 'self.path'. | |
152 | +# """ | |
153 | +# if not os.path.exists(self.path): | |
154 | +# os.makedirs(self.path) | |
155 | +# | |
156 | +# try: | |
157 | +# logging.info("Indexing debtags info from \'%s\'" % | |
158 | +# self.db_path) | |
159 | +# logging.info("Creating new xapian index at \'%s\'" % | |
160 | +# self.path) | |
161 | +# xapian.WritableDatabase.__init__(self,self.path, | |
162 | +# xapian.DB_CREATE_OR_OVERWRITE) | |
163 | +# except xapian.DatabaseError: | |
164 | +# logging.critical("Could not create xapian index.") | |
165 | +# raise Error | |
166 | +# | |
167 | +# self.debtags_db = load_debtags_db(self.db_path) | |
168 | +# self.set_metadata("md5",self.db_md5) | |
169 | +# | |
170 | +# for pkg,tags in self.debtags_db.iter_packages_tags(): | |
171 | +# doc = xapian.Document() | |
172 | +# doc.set_data(pkg) | |
173 | +# for tag in tags: | |
174 | +# doc.add_term(normalize_tags(tag)) | |
175 | +# doc_id = self.add_document(doc) | |
176 | +# logging.debug("Debtags Xapian: Indexing doc %d",doc_id) | |
175 | 177 | |
176 | 178 | class PopconXapianIndex(xapian.WritableDatabase,Singleton): |
177 | 179 | """ |
... | ... | @@ -232,7 +234,7 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton): |
232 | 234 | """ |
233 | 235 | if not os.path.exists(self.path): |
234 | 236 | os.makedirs(self.path) |
235 | - debtags_db = load_debtags_db(self.debtags_path) | |
237 | + debtags_db = load_debtags_db(self.debtags_path) #[FIXME] | |
236 | 238 | |
237 | 239 | try: |
238 | 240 | logging.info("Indexing popcon submissions from \'%s\'" % |
... | ... | @@ -254,6 +256,7 @@ class PopconXapianIndex(xapian.WritableDatabase,Singleton): |
254 | 256 | submission_path) |
255 | 257 | for pkg, freq in self.parse_submission(submission_path): |
256 | 258 | doc.add_term(pkg,freq) |
259 | + #[FIXME] get tags from axi | |
257 | 260 | for tag in debtags_db.tags_of_package(pkg): |
258 | 261 | doc.add_term("XT"+tag,freq) |
259 | 262 | doc_id = self.add_document(doc) |
... | ... | @@ -334,22 +337,27 @@ class PopconClusteredData(Singleton): |
334 | 337 | s.add_pkg(pkg) |
335 | 338 | self.submissions.append(s) |
336 | 339 | |
337 | - distanceFunction = JaccardIndex() | |
338 | - cl = cluster.HierarchicalClustering(self.submissions,lambda x,y: distanceFunction(x.pkgs_list,y.pkgs_list)) | |
339 | - clusters = cl.getlevel(0.5) | |
340 | - for c in clusters: | |
341 | - print "cluster" | |
342 | - for submission in c: | |
343 | - print submission.hash | |
344 | - #cl = KMeansClusteringPopcon(self.submissions, | |
345 | - # lambda x,y: distanceFunction(x.pkgs_list,y.pkgs_list)) | |
340 | + distanceFunction = JaccardDistance() | |
341 | + # cl = cluster.HierarchicalClustering(self.submissions,lambda x,y: distanceFunction(x.pkgs_list,y.pkgs_list)) | |
342 | + # clusters = cl.getlevel(0.5) | |
343 | + # for c in clusters: | |
344 | + # print "cluster" | |
345 | + # for submission in c: | |
346 | + # print submission.hash | |
347 | + cl = KMedoidsClusteringPopcon(self.submissions, lambda x,y: \ | |
348 | + distanceFunction(x.pkgs_list,y.pkgs_list)) | |
346 | 349 | #clusters = cl.getclusters(2) |
347 | - #medoids = cl.getMedoids(2) | |
350 | + medoids = cl.getMedoids(2) | |
351 | + print "medoids" | |
352 | + for m in medoids: | |
353 | + print m.hash | |
348 | 354 | |
349 | 355 | class KMedoidsClusteringPopcon(cluster.KMeansClustering): |
350 | 356 | |
351 | 357 | def __init__(self,data,distance): |
352 | - cluster.KMeansClustering.__init__(self, data, distance) | |
358 | + if len(data)>100: | |
359 | + data_sample = random.sample(data,100) | |
360 | + cluster.KMeansClustering.__init__(self, data_sample, distance) | |
353 | 361 | self.distanceMatrix = {} |
354 | 362 | for submission in self._KMeansClustering__data: |
355 | 363 | self.distanceMatrix[submission.hash] = {} |
... | ... | @@ -377,7 +385,7 @@ class KMedoidsClusteringPopcon(cluster.KMeansClustering): |
377 | 385 | for i in range(len(cluster)): |
378 | 386 | totalDistance = sum(self.distanceMatrix[cluster[i].hash].values()) |
379 | 387 | print "totalDistance[",i,"]=",totalDistance |
380 | - if totalDistance < centroidDistance: | |
388 | + if totalDistance < medoidDistance: | |
381 | 389 | medoidDistance = totalDistance |
382 | 390 | medoid = i |
383 | 391 | print "medoidDistance:",medoidDistance | ... | ... |
src/dissimilarity.py
1 | 1 | #!/usr/bin/env python |
2 | 2 | """ |
3 | - similarity - python module for classes and methods related to similarity | |
4 | - measuring between two sets of data. | |
3 | + dissimilarity - python module for classes and methods related to dissimilarity | |
4 | + measuring between two sets of data. | |
5 | 5 | """ |
6 | 6 | __author__ = "Tassia Camoes Araujo <tassia@gmail.com>" |
7 | 7 | __copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | ... | ... |
src/strategy.py
... | ... | @@ -144,30 +144,30 @@ class ItemReputationStrategy(RecommendationStrategy): |
144 | 144 | logging.critical("Item reputation recommendation strategy is not yet implemented.") |
145 | 145 | raise Error |
146 | 146 | |
147 | -class ContentBasedStrategy(RecommendationStrategy): | |
148 | - """ | |
149 | - Content-based recommendation strategy. | |
150 | - """ | |
151 | - def run(self,rec,user): | |
152 | - """ | |
153 | - Perform recommendation strategy. | |
154 | - """ | |
155 | - profile = user.txi_tag_profile(rec.items_repository,50) | |
156 | - qp = xapian.QueryParser() | |
157 | - query = qp.parse_query(profile) | |
158 | - enquire = xapian.Enquire(rec.items_repository) | |
159 | - enquire.set_query(query) | |
160 | - | |
161 | - try: | |
162 | - mset = enquire.get_mset(0, 20, None, PkgMatchDecider(user.items())) | |
163 | - except xapian.DatabaseError as error: | |
164 | - logging.critical(error.get_msg()) | |
165 | - raise Error | |
166 | - | |
167 | - item_score = {} | |
168 | - for m in mset: | |
169 | - item_score[m.document.get_data()] = m.rank | |
170 | - return recommender.RecommendationResult(item_score,20) | |
147 | +#class ContentBasedStrategy(RecommendationStrategy): | |
148 | +# """ | |
149 | +# Content-based recommendation strategy. | |
150 | +# """ | |
151 | +# def run(self,rec,user): | |
152 | +# """ | |
153 | +# Perform recommendation strategy. | |
154 | +# """ | |
155 | +# profile = user.txi_tag_profile(rec.items_repository,50) | |
156 | +# qp = xapian.QueryParser() | |
157 | +# query = qp.parse_query(profile) | |
158 | +# enquire = xapian.Enquire(rec.items_repository) | |
159 | +# enquire.set_query(query) | |
160 | +# | |
161 | +# try: | |
162 | +# mset = enquire.get_mset(0, 20, None, PkgMatchDecider(user.items())) | |
163 | +# except xapian.DatabaseError as error: | |
164 | +# logging.critical(error.get_msg()) | |
165 | +# raise Error | |
166 | +# | |
167 | +# item_score = {} | |
168 | +# for m in mset: | |
169 | +# item_score[m.document.get_data()] = m.rank | |
170 | +# return recommender.RecommendationResult(item_score,20) | |
171 | 171 | |
172 | 172 | class AxiContentBasedStrategy(RecommendationStrategy): |
173 | 173 | """ | ... | ... |
src/user.py
... | ... | @@ -19,10 +19,12 @@ __license__ = """ |
19 | 19 | along with this program. If not, see <http://www.gnu.org/licenses/>. |
20 | 20 | """ |
21 | 21 | |
22 | +import random | |
22 | 23 | import commands |
23 | 24 | import xapian |
24 | 25 | import logging |
25 | 26 | import apt |
27 | +from singleton import Singleton | |
26 | 28 | |
27 | 29 | class FilterTag(xapian.ExpandDecider): |
28 | 30 | """ |
... | ... | @@ -34,51 +36,84 @@ class FilterTag(xapian.ExpandDecider): |
34 | 36 | """ |
35 | 37 | return term[:2] == "XT" |
36 | 38 | |
39 | +class DemographicProfile(Singleton): | |
40 | + def __init__(self): | |
41 | + self.admin = set(["admin", "hardware", "mail", "protocol", | |
42 | + "network", "security", "web", "interface::web"]) | |
43 | + self.devel = set(["devel", "role::devel-lib", "role::shared-lib"]) | |
44 | + self.desktop = set(["x11", "accessibility", "game", "junior", "office", | |
45 | + "interface::x11"]) | |
46 | + self.art = set(["field::arts", "sound"]) | |
47 | + self.science = set(["science", "biology", "field::astronomy", | |
48 | + "field::aviation", "field::biology", | |
49 | + "field::chemistry", "field::eletronics", | |
50 | + "field::finance", "field::geography", | |
51 | + "field::geology", "field::linguistics", | |
52 | + "field::mathematics", "field::medicine", | |
53 | + "field::meteorology", "field::physics", | |
54 | + "field::statistics"]) | |
55 | + | |
56 | + def __call__(self,profiles_set): | |
57 | + demographic_profile = set() | |
58 | + for profile in profiles_set: | |
59 | + demographic_profile = (demographic_profile | eval("self."+profile,{},{"self":self})) | |
60 | + return demographic_profile | |
61 | + | |
37 | 62 | class User: |
38 | 63 | """ |
39 | 64 | Define a user of a recommender. |
40 | 65 | """ |
41 | - def __init__(self,item_score,user_id=0,demographic_profile=0): | |
66 | + def __init__(self,item_score,user_id=0,profiles_set=0): | |
42 | 67 | """ |
43 | - Set initial parameters. | |
68 | + Set initial user attributes. If no user_id was passed as parameter, a | |
69 | + random 128-bit identifier is generated for that purpose. If the demographic | |
70 | + profile was not defined, it defaults to 'desktop' | |
44 | 71 | """ |
45 | - self.id = user_id | |
46 | 72 | self.item_score = item_score |
73 | + if user_id: | |
74 | + self.id = user_id | |
75 | + else: | |
76 | + random.seed() | |
77 | + self.id = random.getrandbits(128) | |
47 | 78 | self.pkg_profile = self.item_score.keys() |
48 | - self.demographic_profile = demographic_profile | |
79 | + if not profiles_set: | |
80 | + profiles_set = set(["desktop"]) | |
81 | + self.set_demographic_profile(profiles_set) | |
82 | + | |
83 | + def set_demographic_profile(self,profiles_set): | |
84 | + self.demographic_profile = DemographicProfile()(profiles_set) | |
49 | 85 | |
50 | 86 | def items(self): |
51 | 87 | """ |
52 | - Return dictionary relating items and repective scores. | |
88 | + Return the set of user items. | |
53 | 89 | """ |
54 | - return self.item_score.keys() | |
90 | + return set(self.item_score.keys()) | |
55 | 91 | |
56 | 92 | def axi_tag_profile(self,apt_xapian_index,profile_size): |
57 | 93 | """ |
58 | 94 | Return most relevant tags for a list of packages based on axi. |
59 | 95 | """ |
60 | - terms = [] | |
61 | - for item in self.pkg_profile: | |
62 | - terms.append("XP"+item) | |
96 | + terms = ["XP"+item for item in self.pkg_profile] | |
63 | 97 | query = xapian.Query(xapian.Query.OP_OR, terms) |
64 | 98 | enquire = xapian.Enquire(apt_xapian_index) |
65 | 99 | enquire.set_query(query) |
66 | 100 | rset = xapian.RSet() |
67 | - for m in enquire.get_mset(0,30000): #consider all matches | |
101 | + for m in enquire.get_mset(0,apt_xapian_index.get_doccount()): | |
68 | 102 | rset.add_document(m.docid) |
103 | + # statistically good differentiators between relevant and non-relevant | |
69 | 104 | eset = enquire.get_eset(profile_size, rset, FilterTag()) |
70 | 105 | profile = [] |
71 | 106 | for res in eset: |
72 | 107 | profile.append(res.term) |
73 | - logging.debug("%.2f %s" % (res.weight,res.term[2:])) | |
108 | + logging.debug("%.2f %s" % (res.weight,res.term.lstrip("XT"))) | |
74 | 109 | return profile |
75 | 110 | |
76 | - def txi_tag_profile(self,tags_xapian_index,profile_size): | |
77 | - """ | |
78 | - Return most relevant tags for a list of packages based on tags index. | |
79 | - """ | |
80 | - return tags_xapian_index.relevant_tags_from_db(self.pkg_profile, | |
81 | - profile_size) | |
111 | + #def txi_tag_profile(self,tags_xapian_index,profile_size): | |
112 | + # """ | |
113 | + # Return most relevant tags for a list of packages based on tags index. | |
114 | + # """ | |
115 | + # return tags_xapian_index.relevant_tags_from_db(self.pkg_profile, | |
116 | + # profile_size) | |
82 | 117 | |
83 | 118 | def maximal_pkg_profile(self): |
84 | 119 | """ | ... | ... |