Commit cf91ea7cc61ca6cefa11a9655a44767882b94b35
1 parent
0d315564
Exists in
master
and in
1 other branch
Added DebianPackage class to store package information from
apt and DDE; added tfidf weighting methods.
Showing
2 changed files
with
180 additions
and
5 deletions
Show diff stats
... | ... | @@ -0,0 +1,43 @@ |
1 | +#!/usr/bin/env python | |
2 | +""" | |
3 | + user_profiling - example script for testing user profiling | |
4 | +""" | |
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | |
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | |
7 | +__license__ = """ | |
8 | + This program is free software: you can redistribute it and/or modify | |
9 | + it under the terms of the GNU General Public License as published by | |
10 | + the Free Software Foundation, either version 3 of the License, or | |
11 | + (at your option) any later version. | |
12 | + | |
13 | + This program is distributed in the hope that it will be useful, | |
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | + GNU General Public License for more details. | |
17 | + | |
18 | + You should have received a copy of the GNU General Public License | |
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | +""" | |
21 | + | |
import os
import sys
sys.path.insert(0,'../')
import user
import xapian

# Resolve the data directory from the invoking user's home instead of a
# hard-coded account, so the example runs for whoever executes it.
base_dir = os.path.expanduser("~/.app-recommender")

repo = xapian.Database(os.path.join(base_dir, "axi_programs"))
# Keep the 'user' module name intact: bind the instance to its own name.
local_user = user.LocalSystem()
# Filter file format: one tag per line, lines starting with '#' are skipped.
with open(os.path.join(base_dir, "filters", "debtags")) as tags:
    valid_tags = [line.strip() for line in tags if not line.startswith("#")]
size = 20

# Each print below passes one pre-formatted string, which produces the same
# output as the Python 2 statement form "print label, value".
print("\nTF-IDF profiles")
print("\nby tag:  %s" % (local_user.content_profile(repo,"tag",size,valid_tags),))
print("\nby desc:  %s" % (local_user.content_profile(repo,"desc",size),))
print("\nmix: %s" % (local_user.content_profile(repo,"mix",size,valid_tags),))
print("\nhalf %s" % (local_user.content_profile(repo,"half",size,valid_tags),))

print("\nEset profile")
print("\nby tag:  %s" % (local_user.content_profile(repo,"tag_eset",size,valid_tags),))
print("\nby desc: %s" % (local_user.content_profile(repo,"desc_eset",size),))
print("\nmix:  %s" % (local_user.content_profile(repo,"mix_eset",size,valid_tags),))
print("\nhalf %s" % (local_user.content_profile(repo,"half_eset",size,valid_tags),))
src/data.py
... | ... | @@ -27,6 +27,9 @@ import logging |
27 | 27 | import random |
28 | 28 | import cluster |
29 | 29 | import shutil |
30 | +import apt | |
31 | +import re | |
32 | +import operator | |
30 | 33 | |
31 | 34 | from error import Error |
32 | 35 | from singleton import Singleton |
... | ... | @@ -38,8 +41,8 @@ def axi_search_pkgs(axi,pkgs_list): |
38 | 41 | query = xapian.Query(xapian.Query.OP_OR, terms) |
39 | 42 | enquire = xapian.Enquire(axi) |
40 | 43 | enquire.set_query(query) |
41 | - matches = enquire.get_mset(0,axi.get_doccount()) | |
42 | - return [m.docid for m in matches] | |
44 | + mset = enquire.get_mset(0,axi.get_doccount()) | |
45 | + return mset | |
43 | 46 | |
44 | 47 | def axi_search_pkg_tags(axi,pkg): |
45 | 48 | enquire = xapian.Enquire(axi) |
... | ... | @@ -65,6 +68,39 @@ def print_index(index): |
65 | 68 | output += "\n---" |
66 | 69 | return output |
67 | 70 | |
def tfidf_weighting(index,docs,content_filter,plus=0):
    """
    Rank the terms of a set of documents by sublinear tf-idf weight.

    index: xapian database the documents belong to; it provides each
           document's term list, the total document count and the
           per-term document frequency.
    docs: iterable of items exposing 'docid' and 'weight' attributes.
    content_filter: callable receiving a term string; only terms for
                    which it returns a true value are taken into account.
    plus: when true, every occurrence of a term counts int(d.weight)
          times (d being the item it came from) instead of once.

    Return a list of (term, weight) tuples sorted by decreasing weight.
    """
    # Store all terms in one single document, so that the wdf of each term
    # accumulates its frequency over the whole selected set
    terms_doc = xapian.Document()
    for d in docs:
        # The increment only depends on the current item, compute it once
        if plus:
            wdf_inc = int(d.weight)
        else:
            wdf_inc = 1
        for term in index.get_document(d.docid).termlist():
            if content_filter(term.term):
                terms_doc.add_term(term.term,wdf_inc)
    # Compute sublinear tfidf for each term
    doc_count = index.get_doccount()
    weights = {}
    for term in terms_doc.termlist():
        # In 'plus' mode int(d.weight) may be 0, leaving a term with wdf 0;
        # sublinear tf is defined as 0 in that case (log(0) would raise).
        if term.wdf > 0:
            tf = 1+math.log(term.wdf)
        else:
            tf = 0
        idf = math.log(doc_count/
                       float(index.get_termfreq(term.term)))
        weights[term.term] = tf*idf
    return sorted(weights.items(),key=operator.itemgetter(1),reverse=True)
96 | + | |
def tfidf_plus(index,docs,content_filter):
    """
    Shortcut for tfidf_weighting with its 'plus' flag switched on.

    The three arguments are forwarded untouched and the value computed by
    tfidf_weighting is handed back as is.
    """
    weighted_terms = tfidf_weighting(index,docs,content_filter,plus=1)
    return weighted_terms
103 | + | |
68 | 104 | class AppAptXapianIndex(xapian.WritableDatabase): |
69 | 105 | """ |
70 | 106 | Data source for application packages information |
... | ... | @@ -101,11 +137,107 @@ class SampleAptXapianIndex(xapian.WritableDatabase): |
101 | 137 | xapian.DB_CREATE_OR_OVERWRITE) |
102 | 138 | sample = axi_search_pkgs(axi,pkgs_list) |
103 | 139 | for package in sample: |
104 | - doc_id = self.add_document(axi.get_document(package)) | |
140 | + doc_id = self.add_document(axi.get_document(package.docid)) | |
105 | 141 | |
106 | 142 | def __str__(self): |
107 | 143 | return print_index(self) |
108 | 144 | |
class DebianPackage():
    """
    Class to load package information.

    An instance only knows its package name until one of the
    load_details_from_* methods is called. Optional fields (homepage,
    tags, dependency relations...) are set as attributes only when the
    data source provides a value for them.
    """
    # (apt record field, attribute name) for optional plain-text fields
    _APT_OPTIONAL_FIELDS = (
        ('Homepage', 'homepage'),
        ('Depends', 'depends'),
        ('Pre-Depends', 'predepends'),
        ('Recommends', 'recommends'),
        ('Suggests', 'suggests'),
        ('Breaks', 'breaks'),
        ('Conflicts', 'conflicts'),
        ('Replaces', 'replaces'),
        ('Provides', 'provides'),
    )
    # (DDE json key, attribute name) for optional fields; attribute names
    # match the ones set by load_details_from_apt
    _DDE_OPTIONAL_FIELDS = (
        ('homepage', 'homepage'),
        ('depends', 'depends'),
        ('pre_depends', 'predepends'),
        ('recommends', 'recommends'),
        ('suggests', 'suggests'),
        ('conflicts', 'conflicts'),
        ('replaces', 'replaces'),
        ('provides', 'provides'),
    )

    def __init__(self,pkg_name):
        self.name = pkg_name

    def load_details_from_apt(self):
        """
        Fill the package details from the candidate version found in the
        local apt cache.
        """
        pkg_version = apt.Cache()[self.name].candidate
        record = pkg_version.record

        self.maintainer = record['Maintainer']
        self.version = pkg_version.version
        self.summary = pkg_version.summary
        self.description = self.format_description(pkg_version.description)
        self.section = pkg_version.section
        for field, attribute in self._APT_OPTIONAL_FIELDS:
            if field in record:
                setattr(self, attribute, record[field])
        if 'Tag' in record:
            self.tags = self.debtags_str_to_dict(record['Tag'])

    def load_details_from_dde(self,dde_server,dde_port):
        """
        Fill the package details from the json answer of a DDE server
        reachable at dde_server:dde_port.
        """
        # The three values must be passed to '%' as one tuple
        url = ("http://%s:%s/q/udd/packages/all/%s?t=json"
               % (dde_server,dde_port,self.name))
        response = urllib.urlopen(url)
        try:
            json_data = json.load(response)
        finally:
            response.close()
        details = json_data['r']

        self.maintainer = details['maintainer']
        self.version = details['version']
        self.summary = details['description']
        self.description = self.format_description(details['long_description'])
        self.section = details['section']
        for key, attribute in self._DDE_OPTIONAL_FIELDS:
            # Missing and empty values are both treated as "not provided"
            value = details.get(key)
            if value:
                setattr(self, attribute, value)
        if details.get('tag'):
            self.tags = self.debtags_list_to_dict(details['tag'])
        self.popcon_insts = details['popcon']['insts']

    def format_description(self,description):
        """
        Return description with every '.\\n' sequence removed and every
        remaining newline replaced by '<br />'.
        """
        return description.replace('.\n','').replace('\n','<br />')

    def debtags_str_to_dict(self, debtags_str):
        """
        Convert a whitespace separated string of debtags (each possibly
        followed by a comma) into a dictionary, see debtags_list_to_dict.
        """
        debtags_list = [tag.rstrip(",") for tag in debtags_str.split()]
        return self.debtags_list_to_dict(debtags_list)

    def debtags_list_to_dict(self, debtags_list):
        """ input:  ['use::editing',
                     'works-with-format::gif',
                     'works-with-format::jpg',
                     'works-with-format::pdf']
            output: {'use': ['editing'],
                     'works-with-format': ['gif', 'jpg', 'pdf']}

            Tags which do not follow the 'facet::subtag' format are logged
            and left out of the result.
        """
        debtags = {}
        for tag in debtags_list:
            match = re.search(r'^(.*)::(.*)$', tag)
            if not match:
                logging.info("Could not parse debtags format from tag: %s", tag)
                continue
            facet, subtag = match.groups()
            # One independent list per facet, created on first use
            debtags.setdefault(facet, []).append(subtag)
        logging.debug("debtags_list %s", debtags)
        return debtags
240 | + | |
109 | 241 | class PopconSubmission(): |
110 | 242 | def __init__(self,path,user_id=0,binary=1): |
111 | 243 | self.packages = dict() |
... | ... | @@ -174,11 +306,11 @@ class PopconXapianIndex(xapian.WritableDatabase): |
174 | 306 | self.max_popcon = cfg.max_popcon |
175 | 307 | self.valid_pkgs = [] |
176 | 308 | # file format for filter: one package name per line |
177 | - with open(os.path.join(cfg.filters,cfg.pkgs_filter)) as valid_pkgs: | |
309 | + with open(cfg.pkgs_filter) as valid_pkgs: | |
178 | 310 | self.valid_pkgs = [line.strip() for line in valid_pkgs |
179 | 311 | if not line.startswith("#")] |
180 | 312 | logging.debug("Considering %d valid packages" % len(self.valid_pkgs)) |
181 | - with open(os.path.join(cfg.filters,"tags")) as valid_tags: | |
313 | + with open(os.path.join(cfg.filters_dir,"debtags")) as valid_tags: | |
182 | 314 | self.valid_tags = [line.strip() for line in valid_tags |
183 | 315 | if not line.startswith("#")] |
184 | 316 | logging.debug("Considering %d valid tags" % len(self.valid_tags)) | ... | ... |