Commit cf91ea7cc61ca6cefa11a9655a44767882b94b35
1 parent
0d315564
Exists in
master
and in
1 other branch
Added DebianPackage class to store package information from
apt and DDE; Added tfidf weighting methods.
Showing
2 changed files
with
180 additions
and
5 deletions
Show diff stats
@@ -0,0 +1,43 @@ | @@ -0,0 +1,43 @@ | ||
1 | +#!/usr/bin/env python | ||
2 | +""" | ||
3 | + user_profiling - example script for testing user profiling | ||
4 | +""" | ||
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | ||
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | ||
7 | +__license__ = """ | ||
8 | + This program is free software: you can redistribute it and/or modify | ||
9 | + it under the terms of the GNU General Public License as published by | ||
10 | + the Free Software Foundation, either version 3 of the License, or | ||
11 | + (at your option) any later version. | ||
12 | + | ||
13 | + This program is distributed in the hope that it will be useful, | ||
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | + GNU General Public License for more details. | ||
17 | + | ||
18 | + You should have received a copy of the GNU General Public License | ||
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 | +""" | ||
21 | + | ||
22 | +import sys | ||
23 | +sys.path.insert(0,'../') | ||
24 | +import user | ||
25 | +import xapian | ||
26 | + | ||
# Directory holding the xapian index and the filter files.  An optional first
# command-line argument overrides the historical default location.
DEFAULT_BASE_DIR = "/home/tassia/.app-recommender"
base_dir = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_BASE_DIR

repo = xapian.Database(base_dir + "/axi_programs")
# Kept under its own name so the imported 'user' module is not rebound.
local_user = user.LocalSystem()
# Filter file format: one tag per line, lines starting with '#' are ignored.
with open(base_dir + "/filters/debtags") as tags:
    valid_tags = [line.strip() for line in tags if not line.startswith("#")]
size = 20

# (section title, ((label, profiling strategy, pass valid_tags?), ...))
PROFILE_SECTIONS = (
    ("\nTF-IDF profiles", (
        ("\nby tag: ", "tag", True),
        ("\nby desc: ", "desc", False),
        ("\nmix:", "mix", True),
        ("\nhalf", "half", True),
    )),
    ("\nEset profile", (
        ("\nby tag: ", "tag_eset", True),
        ("\nby desc:", "desc_eset", False),
        ("\nmix: ", "mix_eset", True),
        ("\nhalf", "half_eset", True),
    )),
)

for title, strategies in PROFILE_SECTIONS:
    sys.stdout.write("%s\n" % title)
    for label, strategy, uses_tags in strategies:
        if uses_tags:
            profile = local_user.content_profile(repo,strategy,size,valid_tags)
        else:
            profile = local_user.content_profile(repo,strategy,size)
        # Same text as 'print label, profile': one space, then a newline.
        sys.stdout.write("%s %s\n" % (label, profile))
src/data.py
@@ -27,6 +27,9 @@ import logging | @@ -27,6 +27,9 @@ import logging | ||
27 | import random | 27 | import random |
28 | import cluster | 28 | import cluster |
29 | import shutil | 29 | import shutil |
30 | +import apt | ||
31 | +import re | ||
32 | +import operator | ||
30 | 33 | ||
31 | from error import Error | 34 | from error import Error |
32 | from singleton import Singleton | 35 | from singleton import Singleton |
@@ -38,8 +41,8 @@ def axi_search_pkgs(axi,pkgs_list): | @@ -38,8 +41,8 @@ def axi_search_pkgs(axi,pkgs_list): | ||
38 | query = xapian.Query(xapian.Query.OP_OR, terms) | 41 | query = xapian.Query(xapian.Query.OP_OR, terms) |
39 | enquire = xapian.Enquire(axi) | 42 | enquire = xapian.Enquire(axi) |
40 | enquire.set_query(query) | 43 | enquire.set_query(query) |
41 | - matches = enquire.get_mset(0,axi.get_doccount()) | ||
42 | - return [m.docid for m in matches] | 44 | + mset = enquire.get_mset(0,axi.get_doccount()) |
45 | + return mset | ||
43 | 46 | ||
44 | def axi_search_pkg_tags(axi,pkg): | 47 | def axi_search_pkg_tags(axi,pkg): |
45 | enquire = xapian.Enquire(axi) | 48 | enquire = xapian.Enquire(axi) |
@@ -65,6 +68,39 @@ def print_index(index): | @@ -65,6 +68,39 @@ def print_index(index): | ||
65 | output += "\n---" | 68 | output += "\n---" |
66 | return output | 69 | return output |
67 | 70 | ||
def tfidf_weighting(index,docs,content_filter,plus=0):
    """
    Rank the terms of a set of documents by sublinear tf-idf.

    index          -- source of the documents; it is asked for documents
                      (get_document), the total number of documents
                      (get_doccount) and per-term document frequencies
                      (get_termfreq).
    docs           -- iterable of matches exposing 'docid' and 'weight'.
    content_filter -- callable receiving a term; only terms for which it
                      returns a true value are taken into account.
    plus           -- when true, each occurrence of a term counts
                      int(match weight) times instead of once.

    Return a list of (term, weight) tuples sorted by decreasing weight;
    equally weighted terms are ordered by term.
    """
    # Imported locally so the function does not rely on a module-level
    # import being present.
    import math

    # Accumulate how often each accepted term occurs in the selected set
    counts = {}
    for d in docs:
        increment = int(d.weight) if plus else 1
        for term in index.get_document(d.docid).termlist():
            if content_filter(term.term):
                counts[term.term] = counts.get(term.term,0) + increment
    # Compute sublinear tfidf for each term
    doc_count = index.get_doccount()
    weights = {}
    for term, wdf in counts.items():
        # With 'plus', match weights below 1 truncate to 0; log(0) is
        # undefined, so such terms get a term frequency of 0.
        if wdf > 0:
            tf = 1+math.log(wdf)
        else:
            tf = 0.0
        idf = math.log(doc_count/float(index.get_termfreq(term)))
        weights[term] = tf*idf
    return sorted(weights.items(), key=lambda item: (-item[1], item[0]))
96 | + | ||
def tfidf_plus(index,docs,content_filter):
    """
    Delegate to tfidf_weighting() with its 'plus' argument set to 1 and
    return whatever that call returns.
    """
    weighted_terms = tfidf_weighting(index,docs,content_filter,plus=1)
    return weighted_terms
103 | + | ||
68 | class AppAptXapianIndex(xapian.WritableDatabase): | 104 | class AppAptXapianIndex(xapian.WritableDatabase): |
69 | """ | 105 | """ |
70 | Data source for application packages information | 106 | Data source for application packages information |
@@ -101,11 +137,107 @@ class SampleAptXapianIndex(xapian.WritableDatabase): | @@ -101,11 +137,107 @@ class SampleAptXapianIndex(xapian.WritableDatabase): | ||
101 | xapian.DB_CREATE_OR_OVERWRITE) | 137 | xapian.DB_CREATE_OR_OVERWRITE) |
102 | sample = axi_search_pkgs(axi,pkgs_list) | 138 | sample = axi_search_pkgs(axi,pkgs_list) |
103 | for package in sample: | 139 | for package in sample: |
104 | - doc_id = self.add_document(axi.get_document(package)) | 140 | + doc_id = self.add_document(axi.get_document(package.docid)) |
105 | 141 | ||
106 | def __str__(self): | 142 | def __str__(self): |
107 | return print_index(self) | 143 | return print_index(self) |
108 | 144 | ||
class DebianPackage():
    """
    Class to load package information.

    Only 'name' is set on construction.  The load_details_* methods fill in
    maintainer, version, summary, description and section, plus whichever
    optional attributes the data source provides: homepage, tags, depends,
    predepends, recommends, suggests, breaks, conflicts, replaces, provides.
    load_details_from_dde() additionally sets popcon_insts.
    """
    # (apt record field, attribute name) for fields copied verbatim when
    # present in the package record.
    _APT_OPTIONAL_FIELDS = (
        ('Homepage', 'homepage'),
        ('Depends', 'depends'),
        ('Pre-Depends', 'predepends'),
        ('Recommends', 'recommends'),
        ('Suggests', 'suggests'),
        ('Breaks', 'breaks'),
        ('Conflicts', 'conflicts'),
        ('Replaces', 'replaces'),
        ('Provides', 'provides'),
    )
    # (DDE json key, attribute name) for values copied verbatim when they
    # are present and non-empty in the DDE answer.
    _DDE_OPTIONAL_FIELDS = (
        ('homepage', 'homepage'),
        ('depends', 'depends'),
        ('pre_depends', 'predepends'),
        ('recommends', 'recommends'),
        ('suggests', 'suggests'),
        ('conflicts', 'conflicts'),
        ('replaces', 'replaces'),
        ('provides', 'provides'),
    )

    def __init__(self,pkg_name):
        self.name = pkg_name

    def load_details_from_apt(self):
        """
        Fill the package attributes from the candidate version found in the
        local apt cache.
        """
        pkg_version = apt.Cache()[self.name].candidate
        record = pkg_version.record

        self.maintainer = record['Maintainer']
        self.version = pkg_version.version
        self.summary = pkg_version.summary
        self.description = self.format_description(pkg_version.description)
        self.section = pkg_version.section
        if 'Tag' in record:
            self.tags = self.debtags_str_to_dict(record['Tag'])
        for field, attribute in self._APT_OPTIONAL_FIELDS:
            if field in record:
                setattr(self, attribute, record[field])

    def load_details_from_dde(self,dde_server,dde_port):
        """
        Fill the package attributes from the json answer of a DDE server.

        dde_server -- host name of the DDE server.
        dde_port   -- port of the DDE server.
        """
        # Imported locally so the method does not rely on module-level
        # imports being present.
        import json
        import urllib

        url = ("http://%s:%s/q/udd/packages/all/%s?t=json"
               % (dde_server,dde_port,self.name))
        response = urllib.urlopen(url)
        try:
            details = json.load(response)['r']
        finally:
            response.close()

        self.maintainer = details['maintainer']
        self.version = details['version']
        self.summary = details['description']
        self.description = self.format_description(details['long_description'])
        self.section = details['section']
        if details.get('tag'):
            self.tags = self.debtags_list_to_dict(details['tag'])
        for key, attribute in self._DDE_OPTIONAL_FIELDS:
            if details.get(key):
                setattr(self, attribute, details[key])
        self.popcon_insts = details['popcon']['insts']

    def format_description(self,description):
        """
        Return 'description' with every '.\\n' sequence removed and the
        remaining line breaks replaced by '<br />'.
        """
        return description.replace('.\n','').replace('\n','<br />')

    def debtags_str_to_dict(self, debtags_str):
        """
        Split a whitespace separated string of tags, strip the trailing
        commas and group the tags with debtags_list_to_dict().
        """
        debtags_list = [tag.rstrip(",") for tag in debtags_str.split()]
        return self.debtags_list_to_dict(debtags_list)

    def debtags_list_to_dict(self, debtags_list):
        """ input:  ['use::editing',
                     'works-with-format::gif',
                     'works-with-format::jpg',
                     'works-with-format::pdf']
            output: {'use': ['editing'],
                     'works-with-format': ['gif', 'jpg', 'pdf']}

            Tags that do not follow the 'facet::subtag' format are logged
            and left out of the result.
        """
        debtags = {}
        for tag in debtags_list:
            match = re.search(r'^(.*)::(.*)$', tag)
            if not match:
                logging.info("Could not parse debtags format from tag: %s", tag)
                # Nothing to unpack from a failed match; skip this tag.
                continue
            facet, subtag = match.groups()
            debtags.setdefault(facet, []).append(subtag)
        logging.debug("debtags_list %s", debtags)
        return debtags
240 | + | ||
109 | class PopconSubmission(): | 241 | class PopconSubmission(): |
110 | def __init__(self,path,user_id=0,binary=1): | 242 | def __init__(self,path,user_id=0,binary=1): |
111 | self.packages = dict() | 243 | self.packages = dict() |
@@ -174,11 +306,11 @@ class PopconXapianIndex(xapian.WritableDatabase): | @@ -174,11 +306,11 @@ class PopconXapianIndex(xapian.WritableDatabase): | ||
174 | self.max_popcon = cfg.max_popcon | 306 | self.max_popcon = cfg.max_popcon |
175 | self.valid_pkgs = [] | 307 | self.valid_pkgs = [] |
176 | # file format for filter: one package name per line | 308 | # file format for filter: one package name per line |
177 | - with open(os.path.join(cfg.filters,cfg.pkgs_filter)) as valid_pkgs: | 309 | + with open(cfg.pkgs_filter) as valid_pkgs: |
178 | self.valid_pkgs = [line.strip() for line in valid_pkgs | 310 | self.valid_pkgs = [line.strip() for line in valid_pkgs |
179 | if not line.startswith("#")] | 311 | if not line.startswith("#")] |
180 | logging.debug("Considering %d valid packages" % len(self.valid_pkgs)) | 312 | logging.debug("Considering %d valid packages" % len(self.valid_pkgs)) |
181 | - with open(os.path.join(cfg.filters,"tags")) as valid_tags: | 313 | + with open(os.path.join(cfg.filters_dir,"debtags")) as valid_tags: |
182 | self.valid_tags = [line.strip() for line in valid_tags | 314 | self.valid_tags = [line.strip() for line in valid_tags |
183 | if not line.startswith("#")] | 315 | if not line.startswith("#")] |
184 | logging.debug("Considering %d valid tags" % len(self.valid_tags)) | 316 | logging.debug("Considering %d valid tags" % len(self.valid_tags)) |