Commit e6bf05b1c28a63af33232ce0457e665f04c831d0
1 parent
0b42f57e
Exists in
master
and in
1 other branch
Collaborative strategy implementation.
Showing
5 changed files
with
241 additions
and
27 deletions
Show diff stats
src/config.py
... | ... | @@ -41,6 +41,8 @@ class Config(): |
41 | 41 | self.tags_index = "~/.app-recommender/debtags_index" |
42 | 42 | self.axi = "/var/lib/apt-xapian-index/index" |
43 | 43 | self.axi_values = "/var/lib/apt-xapian-index/values" |
44 | + self.popcon_index = "~/.app-recommender/popcon_index" | |
45 | + self.popcon_dir = "~/.app-recommender/popcon_dir" | |
44 | 46 | self.strategy = "ct" # defaults to the cheapest one |
45 | 47 | self.reindex = 0 |
46 | 48 | self.load_options() |
... | ... | @@ -62,6 +64,8 @@ class Config(): |
62 | 64 | print " -i, --tagsindex=PATH Path to debtags dedicated index." |
63 | 65 | print " -r, --force-reindex Force reindexing debtags database." |
64 | 66 | print " -a, --axi=PATH Path to Apt-xapian-index." |
67 | + print " -p, --popconindex=PATH Path to popcon dedicated index." | |
68 | + print " -m, --popcondir=PATH Path to popcon submissions dir." | |
65 | 69 | print " -s, --strategy=OPTION Recommendation strategy." |
66 | 70 | print "" |
67 | 71 | print " [ strategy options ] " |
... | ... | @@ -104,10 +108,13 @@ class Config(): |
104 | 108 | self.tags_index = self.read_option('recommender', 'tags_index') |
105 | 109 | self.reindex = self.read_option('recommender', 'reindex') |
106 | 110 | self.axi = self.read_option('recommender', 'axi') |
111 | + self.popcon_index = self.read_option('recommender', 'popcon_index') | |
112 | + self.popcon_dir = self.read_option('recommender', 'popcon_dir') | |
107 | 113 | |
108 | - short_options = "hdvo:c:t:i:ra:s:" | |
114 | + short_options = "hdvo:c:t:i:ra:p:m:s:" | |
109 | 115 | long_options = ["help", "debug", "verbose", "output=", "config=", |
110 | - "tagsdb=", "tagsindex=", "reindex", "axi=", "strategy="] | |
116 | + "tagsdb=", "tagsindex=", "reindex", "axi=", | |
117 | + "popconindex=", "popcondir=", "strategy="] | |
111 | 118 | try: |
112 | 119 | opts, args = getopt.getopt(sys.argv[1:], short_options, |
113 | 120 | long_options) |
... | ... | @@ -138,6 +145,10 @@ class Config(): |
138 | 145 | elif o in ("-a", "--axi"): |
139 | 146 | self.axi = p + "/index" |
140 | 147 | self.axi_values = p + "/values" |
148 | + elif o in ("-p", "--popconindex"): | |
149 | + self.popcon_index = p | |
150 | + elif o in ("-p", "--popcondir"): | |
151 | + self.popcon_dir = p | |
141 | 152 | elif o in ("-s", "--strategy"): |
142 | 153 | self.strategy = p |
143 | 154 | else: | ... | ... |
src/data.py
... | ... | @@ -19,6 +19,7 @@ |
19 | 19 | |
20 | 20 | import os |
21 | 21 | import sys |
22 | +import gc | |
22 | 23 | import re |
23 | 24 | import xapian |
24 | 25 | import axi |
... | ... | @@ -53,6 +54,21 @@ def normalize_tags(string): |
53 | 54 | """ |
54 | 55 | return string.replace(':','_').replace('-','\'') |
55 | 56 | |
57 | +def load_debtags_db(db_path): | |
58 | + """ | |
59 | + Load debtags database from the source file. | |
60 | + """ | |
61 | + tag_filter = re.compile(r"^special::.+$|^.+::TODO$") | |
62 | + try: | |
63 | + db_file = open(db_path, "r") | |
64 | + debtags_db = debtags.DB() | |
65 | + debtags_db.read(db_file,lambda x: not tag_filter.match(x)) | |
66 | + db_file.close() | |
67 | + return debtags_db | |
68 | + except: | |
69 | + logging.error("Could not load DebtagsDB from '%s'." % self.db_path) | |
70 | + raise Error | |
71 | + | |
56 | 72 | class TagsXapianIndex(xapian.WritableDatabase,Singleton): |
57 | 73 | """ |
58 | 74 | Data source for tags info defined as a singleton xapian database. |
... | ... | @@ -76,25 +92,25 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton): |
76 | 92 | db_file.close() |
77 | 93 | self.load_index(cfg.reindex) |
78 | 94 | |
79 | - def load_db(self): | |
80 | - """ | |
81 | - Load debtags database from the source file. | |
82 | - """ | |
83 | - tag_filter = re.compile(r"^special::.+$|^.+::TODO$") | |
84 | - try: | |
85 | - db_file = open(self.db_path, "r") | |
86 | - self.debtags_db.read(db_file,lambda x: not tag_filter.match(x)) | |
87 | - db_file.close() | |
88 | - except: | |
89 | - logging.error("Could not load DebtagsDB from '%s'." % self.db_path) | |
90 | - raise Error | |
95 | +# def load_db(self): | |
96 | +# """ | |
97 | +# Load debtags database from the source file. | |
98 | +# """ | |
99 | +# tag_filter = re.compile(r"^special::.+$|^.+::TODO$") | |
100 | +# try: | |
101 | +# db_file = open(self.db_path, "r") | |
102 | +# self.debtags_db.read(db_file,lambda x: not tag_filter.match(x)) | |
103 | +# db_file.close() | |
104 | +# except: | |
105 | +# logging.error("Could not load DebtagsDB from '%s'." % self.db_path) | |
106 | +# raise Error | |
91 | 107 | |
92 | 108 | def relevant_tags_from_db(self,pkgs_list,qtd_of_tags): |
93 | 109 | """ |
94 | 110 | Return most relevant tags considering a list of packages. |
95 | 111 | """ |
96 | 112 | if not self.debtags_db.package_count(): |
97 | - self.load_db() | |
113 | + self.debtags_db = load_debtags_db(self.db_path) | |
98 | 114 | relevant_db = self.debtags_db.choose_packages(pkgs_list) |
99 | 115 | relevance_index = debtags.relevance_index_function(self.debtags_db, |
100 | 116 | relevant_db) |
... | ... | @@ -117,7 +133,7 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton): |
117 | 133 | logging.info("Index must be updated.") |
118 | 134 | reindex = 1 |
119 | 135 | except xapian.DatabaseError: |
120 | - logging.info("Could not open index.") | |
136 | + logging.info("Could not open debtags index.") | |
121 | 137 | reindex =1 |
122 | 138 | |
123 | 139 | if reindex: |
... | ... | @@ -126,13 +142,15 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton): |
126 | 142 | def new_index(self): |
127 | 143 | """ |
128 | 144 | Create a xapian index for debtags info based on 'debtags_db' and |
129 | - place it at 'index_path'. | |
145 | + place it at 'self.path'. | |
130 | 146 | """ |
131 | 147 | if not os.path.exists(self.path): |
132 | 148 | os.makedirs(self.path) |
133 | 149 | |
134 | 150 | try: |
135 | - logging.info("Creating new xapian index for debtags at \'%s\'" % | |
151 | + logging.info("Indexing debtags info from \'%s\'" % | |
152 | + self.db_path) | |
153 | + logging.info("Creating new xapian index at \'%s\'" % | |
136 | 154 | self.path) |
137 | 155 | xapian.WritableDatabase.__init__(self,self.path, |
138 | 156 | xapian.DB_CREATE_OR_OVERWRITE) |
... | ... | @@ -140,7 +158,7 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton): |
140 | 158 | logging.critical("Could not create xapian index.") |
141 | 159 | raise Error |
142 | 160 | |
143 | - self.load_db() | |
161 | + self.debtags_db = load_debtags_db(self.db_path) | |
144 | 162 | self.set_metadata("md5",self.db_md5) |
145 | 163 | |
146 | 164 | for pkg,tags in self.debtags_db.iter_packages_tags(): |
... | ... | @@ -149,4 +167,94 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton): |
149 | 167 | for tag in tags: |
150 | 168 | doc.add_term(normalize_tags(tag)) |
151 | 169 | doc_id = self.add_document(doc) |
152 | - logging.debug("Indexing doc %d",doc_id) | |
170 | + logging.debug("Debtags Xapian: Indexing doc %d",doc_id) | |
171 | + | |
172 | +class PopconXapianIndex(xapian.WritableDatabase,Singleton): | |
173 | + """ | |
174 | + Data source for popcon submissions defined as a singleton xapian database. | |
175 | + """ | |
176 | + def __init__(self,cfg): | |
177 | + """ | |
178 | + Set initial attributes. | |
179 | + """ | |
180 | + self.path = os.path.expanduser(cfg.popcon_index) | |
181 | + self.popcon_dir = os.path.expanduser(cfg.popcon_dir) | |
182 | + self.debtags_path = os.path.expanduser(cfg.tags_db) | |
183 | + self.load_index() | |
184 | + | |
185 | + def parse_submission(self,submission_path,binary=1): | |
186 | + """ | |
187 | + Parse a popcon submission, generating the names of the valid packages | |
188 | + in the vote. | |
189 | + """ | |
190 | + submission = open(submission_path) | |
191 | + for line in submission: | |
192 | + if not line.startswith("POPULARITY"): | |
193 | + if not line.startswith("END-POPULARITY"): | |
194 | + data = line[:-1].split(" ") | |
195 | + if len(data) > 3: | |
196 | + if binary: | |
197 | + # every installed package has the same weight | |
198 | + yield data[2], 1 | |
199 | + elif data[3] == '<NOFILES>': | |
200 | + # No executable files to track | |
201 | + yield data[2], 1 | |
202 | + elif len(data) == 4: | |
203 | + # Recently used packages | |
204 | + yield data[2], 10 | |
205 | + elif data[4] == '<OLD>': | |
206 | + # Unused packages | |
207 | + yield data[2], 3 | |
208 | + elif data[4] == '<RECENT-CTIME>': | |
209 | + # Recently installed packages | |
210 | + yield data[2], 8 | |
211 | + | |
212 | + def load_index(self): | |
213 | + """ | |
214 | + Load an existing popcon index. | |
215 | + """ | |
216 | + try: | |
217 | + logging.info("Opening existing popcon xapian index at \'%s\'" | |
218 | + % self.path) | |
219 | + xapian.Database.__init__(self,self.path) | |
220 | + except xapian.DatabaseError: | |
221 | + logging.info("Could not open popcon index.") | |
222 | + self.new_index() | |
223 | + | |
224 | + def new_index(self): | |
225 | + """ | |
226 | + Create a xapian index for popcon submissions at 'popcon_dir' and | |
227 | + place it at 'self.path'. | |
228 | + """ | |
229 | + if not os.path.exists(self.path): | |
230 | + os.makedirs(self.path) | |
231 | + debtags_db = load_debtags_db(self.debtags_path) | |
232 | + | |
233 | + try: | |
234 | + logging.info("Indexing popcon submissions from \'%s\'" % | |
235 | + self.popcon_dir) | |
236 | + logging.info("Creating new xapian index at \'%s\'" % | |
237 | + self.path) | |
238 | + xapian.WritableDatabase.__init__(self,self.path, | |
239 | + xapian.DB_CREATE_OR_OVERWRITE) | |
240 | + except xapian.DatabaseError: | |
241 | + logging.critical("Could not create popcon xapian index.") | |
242 | + raise Error | |
243 | + | |
244 | + for root, dirs, files in os.walk(self.popcon_dir): | |
245 | + for submission in files: | |
246 | + submission_path = os.path.join(root, submission) | |
247 | + doc = xapian.Document() | |
248 | + doc.set_data(submission) | |
249 | + logging.debug("Parsing popcon submission at \'%s\'" % | |
250 | + submission_path) | |
251 | + for pkg, freq in self.parse_submission(submission_path): | |
252 | + doc.add_term(pkg,freq) | |
253 | + for tag in debtags_db.tags_of_package(pkg): | |
254 | + doc.add_term("XT"+tag,freq) | |
255 | + doc_id = self.add_document(doc) | |
256 | + logging.debug("Popcon Xapian: Indexing doc %d" % doc_id) | |
257 | + # python garbage collector | |
258 | + gc.collect() | |
259 | + # flush to disk database changes | |
260 | + self.flush() | ... | ... |
src/recommender.py
... | ... | @@ -83,6 +83,14 @@ class Recommender: |
83 | 83 | self.items_repository = xapian.Database(cfg.axi) |
84 | 84 | self.strategy = AxiContentBasedStrategy() |
85 | 85 | |
86 | + def col(self,cfg): | |
87 | + """ | |
88 | + Set recommender attributes to perform collaborative recommendation | |
89 | + using popcon-xapian-index as source data. | |
90 | + """ | |
91 | + self.users_repository = PopconXapianIndex(cfg) | |
92 | + self.strategy = CollaborativeStrategy() | |
93 | + | |
86 | 94 | def set_strategy(self,strategy): |
87 | 95 | """ |
88 | 96 | Set the recommendation strategy. | ... | ... |
src/strategy.py
... | ... | @@ -48,7 +48,6 @@ class PopularityHeuristic(ReputationHeuristic): |
48 | 48 | """ |
49 | 49 | pass |
50 | 50 | |
51 | - | |
52 | 51 | class PkgMatchDecider(xapian.MatchDecider): |
53 | 52 | """ |
54 | 53 | Extend xapian.MatchDecider to not consider installed packages. |
... | ... | @@ -67,6 +66,64 @@ class PkgMatchDecider(xapian.MatchDecider): |
67 | 66 | """ |
68 | 67 | return doc.get_data() not in self.installed_pkgs |
69 | 68 | |
69 | +class UserMatchDecider(xapian.MatchDecider): | |
70 | + """ | |
71 | + Extend xapian.MatchDecider to match similar profiles. | |
72 | + """ | |
73 | + | |
74 | + def __init__(self, profile): | |
75 | + """ | |
76 | + Set initial parameters. | |
77 | + """ | |
78 | + xapian.MatchDecider.__init__(self) | |
79 | + self.profile = profile | |
80 | + print "mdecider:",profile | |
81 | + | |
82 | + def __call__(self, doc): | |
83 | + """ | |
83 | + True if the user has at least half of the packages from the profile. | |
85 | + """ | |
86 | + profile_size = len(self.profile) | |
87 | + pkg_match=0 | |
88 | + for term in doc: | |
89 | + if term.term in self.profile: | |
90 | + pkg_match = pkg_match+1 | |
91 | + print "id",doc.get_docid(),"match",pkg_match | |
92 | + return pkg_match >= profile_size/2 | |
93 | + | |
94 | +class PkgExpandDecider(xapian.ExpandDecider): | |
95 | + """ | |
96 | + Extend xapian.ExpandDecider to consider packages only. | |
97 | + """ | |
98 | + | |
99 | + def __init__(self): | |
100 | + """ | |
101 | + Call base class init. | |
102 | + """ | |
103 | + xapian.ExpandDecider.__init__(self) | |
104 | + | |
105 | + def __call__(self, term): | |
106 | + """ | |
107 | + True if the term is a package. | |
108 | + """ | |
109 | + return not term.startswith("XT") | |
110 | + | |
111 | +class TagExpandDecider(xapian.ExpandDecider): | |
112 | + """ | |
113 | + Extend xapian.ExpandDecider to consider tags only. | |
114 | + """ | |
115 | + | |
116 | + def __init__(self, profile): | |
117 | + """ | |
118 | + Call base class init. | |
119 | + """ | |
120 | + xapian.ExpandDecider.__init__(self) | |
121 | + | |
122 | + def __call__(self, doc): | |
123 | + """ | |
124 | + True if the term is a tag (prefixed with "XT"). | |
125 | + """ | |
126 | + return term.startswith("XT") | |
70 | 127 | |
71 | 128 | class RecommendationStrategy: |
72 | 129 | """ |
... | ... | @@ -82,7 +139,8 @@ class ItemReputationStrategy(RecommendationStrategy): |
82 | 139 | """ |
83 | 140 | Perform recommendation strategy. |
84 | 141 | """ |
85 | - return RecomendationResult() | |
142 | + logging.critical("Item reputation recommendation strategy is not yet implemented.") | |
143 | + raise Error | |
86 | 144 | |
87 | 145 | class ContentBasedStrategy(RecommendationStrategy): |
88 | 146 | """ |
... | ... | @@ -133,15 +191,41 @@ class AxiContentBasedStrategy(RecommendationStrategy): |
133 | 191 | item_score[m.document.get_data()] = m.rank |
134 | 192 | return recommender.RecommendationResult(item_score,20) |
135 | 193 | |
136 | -class ColaborativeStrategy(RecommendationStrategy): | |
194 | +class CollaborativeStrategy(RecommendationStrategy): | |
137 | 195 | """ |
138 | 196 | Collaborative recommendation strategy.
139 | 197 | """ |
140 | - def run(self,user,users_repository,similarity_measure): | |
198 | + #def run(self,rec,user,similarity_measure): | |
199 | + def run(self,rec,user): | |
141 | 200 | """ |
142 | 201 | Perform recommendation strategy. |
143 | 202 | """ |
144 | - return RecomendationResult() | |
203 | + profile = user.maximal_pkg_profile() | |
204 | + query = xapian.Query(xapian.Query.OP_OR,profile) | |
205 | + enquire = xapian.Enquire(rec.users_repository) | |
206 | + enquire.set_query(query) | |
207 | + | |
208 | + try: | |
209 | + #mset = enquire.get_mset(0, 182, None, UserMatchDecider(profile)) | |
210 | + mset = enquire.get_mset(0, 20) | |
211 | + except xapian.DatabaseError as error: | |
212 | + logging.critical(error.get_msg()) | |
213 | + raise Error | |
214 | + | |
215 | + rset = xapian.RSet() | |
216 | + for m in mset: | |
217 | + rset.add_document(m.document.get_docid()) | |
218 | + logging.debug("Counting as relevant submission %s" % | |
219 | + m.document.get_data()) | |
220 | + | |
221 | + eset = enquire.get_eset(20,rset,PkgExpandDecider()) | |
222 | + rank = 0 | |
223 | + item_score = {} | |
224 | + for term in eset: | |
225 | + item_score[term.term] = rank | |
226 | + rank = rank+1 | |
227 | + | |
228 | + return recommender.RecommendationResult(item_score,20) | |
145 | 229 | |
146 | 230 | class KnowledgeBasedStrategy(RecommendationStrategy): |
147 | 231 | """ |
... | ... | @@ -151,7 +235,8 @@ class KnowledgeBasedStrategy(RecommendationStrategy): |
151 | 235 | """ |
152 | 236 | Perform recommendation strategy. |
153 | 237 | """ |
154 | - return RecomendationResult() | |
238 | + logging.critical("Knowledge-based recommendation strategy is not yet implemented.") | |
239 | + raise Error | |
155 | 240 | |
156 | 241 | class DemographicStrategy(RecommendationStrategy): |
157 | 242 | """ |
... | ... | @@ -161,4 +246,5 @@ class DemographicStrategy(RecommendationStrategy): |
161 | 246 | """ |
162 | 247 | Perform recommendation strategy. |
163 | 248 | """ |
164 | - return RecomendationResult() | |
249 | + logging.critical("Demographic recommendation strategy is not yet implemented.") | |
250 | + raise Error | ... | ... |
src/user.py