Commit cf91ea7cc61ca6cefa11a9655a44767882b94b35

Authored by Tássia Camões Araújo
1 parent 0d315564
Exists in master and in 1 other branch add_vagrant

Added DebianPackage class to store package information from

apt and DDE; Added tfidf weighting methods.
Showing 2 changed files with 180 additions and 5 deletions   Show diff stats
src/bin/user_profiling.py 0 → 100755
... ... @@ -0,0 +1,43 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + user_profiling - example script for testing user profiling
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
import os
import sys
sys.path.insert(0,'../')
import user
import xapian

# Resolve the data directory from the invoking user's home instead of a
# single developer's hard-coded /home path, so the script runs on any account.
base_dir = os.path.expanduser("~/.app-recommender")
repo = xapian.Database(os.path.join(base_dir, "axi_programs"))
# Bind the profiled system to its own name so the `user` module stays usable.
local_user = user.LocalSystem()
# Filter file format: one tag per line, lines starting with '#' are comments.
with open(os.path.join(base_dir, "filters", "debtags")) as tags:
    valid_tags = [line.strip() for line in tags if not line.startswith("#")]
# Profile size passed to every content_profile() call below.
size = 20


def show(label, profile):
    """
    Print a label followed by a profile, separated by a single space.

    The single formatted argument produces the same output as the
    statement form `print label, profile`.
    """
    print("%s %s" % (label, profile))


print("\nTF-IDF profiles")
show("\nby tag: ", local_user.content_profile(repo,"tag",size,valid_tags))
show("\nby desc: ", local_user.content_profile(repo,"desc",size))
show("\nmix:", local_user.content_profile(repo,"mix",size,valid_tags))
show("\nhalf", local_user.content_profile(repo,"half",size,valid_tags))

print("\nEset profile")
show("\nby tag: ", local_user.content_profile(repo,"tag_eset",size,valid_tags))
show("\nby desc:", local_user.content_profile(repo,"desc_eset",size))
show("\nmix: ", local_user.content_profile(repo,"mix_eset",size,valid_tags))
show("\nhalf", local_user.content_profile(repo,"half_eset",size,valid_tags))
... ...
src/data.py
... ... @@ -27,6 +27,9 @@ import logging
27 27 import random
28 28 import cluster
29 29 import shutil
  30 +import apt
  31 +import re
  32 +import operator
30 33  
31 34 from error import Error
32 35 from singleton import Singleton
... ... @@ -38,8 +41,8 @@ def axi_search_pkgs(axi,pkgs_list):
38 41 query = xapian.Query(xapian.Query.OP_OR, terms)
39 42 enquire = xapian.Enquire(axi)
40 43 enquire.set_query(query)
41   - matches = enquire.get_mset(0,axi.get_doccount())
42   - return [m.docid for m in matches]
  44 + mset = enquire.get_mset(0,axi.get_doccount())
  45 + return mset
43 46  
44 47 def axi_search_pkg_tags(axi,pkg):
45 48 enquire = xapian.Enquire(axi)
... ... @@ -65,6 +68,39 @@ def print_index(index):
65 68 output += "\n---"
66 69 return output
67 70  
def tfidf_weighting(index,docs,content_filter,plus=0):
    """
    Rank the terms of a set of documents by sublinear tf-idf weight.

    index: object providing get_document(docid), get_doccount() and
           get_termfreq(term); get_document() must return an object whose
           termlist() yields items with a `term` attribute.
    docs: iterable of matches exposing `docid` (and `weight` when `plus`
          is set).
    content_filter: callable taking a term and returning a true value for
                    the terms that should be considered.
    plus: when true, each occurrence of a term counts int(d.weight) times,
          where d is the match the term came from, instead of once.

    Return a list of (term, weight) tuples sorted by decreasing weight;
    terms with equal weight are ordered alphabetically.
    """
    # Pool the within-set frequency of every accepted term, as if all the
    # selected documents were merged into one.
    pooled_wdf = {}
    for d in docs:
        wdf_inc = int(d.weight) if plus else 1
        for term in index.get_document(d.docid).termlist():
            if content_filter(term.term):
                pooled_wdf[term.term] = pooled_wdf.get(term.term, 0) + wdf_inc
    # Compute sublinear tfidf for each term
    doc_count = float(index.get_doccount())
    weights = {}
    for term, wdf in pooled_wdf.items():
        termfreq = index.get_termfreq(term)
        # A pooled frequency of zero happens when every contributing match
        # has a weight below 1 (int() truncates it to 0); log(0) is undefined
        # so such terms are left out. The same goes for a term the index
        # reports no documents for, which would divide by zero.
        if wdf <= 0 or not termfreq:
            continue
        tf = 1+math.log(wdf)
        idf = math.log(doc_count/termfreq)
        weights[term] = tf*idf
    # Highest weight first; the term itself breaks ties deterministically.
    return sorted(weights.items(), key=lambda item: (-item[1], item[0]))
  96 +
def tfidf_plus(index,docs,content_filter):
    """
    Weighted variant of tfidf_weighting.

    Delegates to tfidf_weighting with its `plus` flag enabled, so that a
    term's frequency is incremented by the integer part of the weight of
    the match it came from instead of by one. The result of that call is
    returned unchanged.
    """
    return tfidf_weighting(index,docs,content_filter,plus=1)
  103 +
68 104 class AppAptXapianIndex(xapian.WritableDatabase):
69 105 """
70 106 Data source for application packages information
... ... @@ -101,11 +137,107 @@ class SampleAptXapianIndex(xapian.WritableDatabase):
101 137 xapian.DB_CREATE_OR_OVERWRITE)
102 138 sample = axi_search_pkgs(axi,pkgs_list)
103 139 for package in sample:
104   - doc_id = self.add_document(axi.get_document(package))
  140 + doc_id = self.add_document(axi.get_document(package.docid))
105 141  
106 142 def __str__(self):
107 143 return print_index(self)
108 144  
class DebianPackage():
    """
    Class to load package information.

    An instance starts with only `name` set. The load_details_from_*
    methods fill in maintainer, version, summary, description and section,
    plus the optional attributes (homepage, tags, depends, predepends,
    recommends, suggests, breaks, conflicts, replaces, provides) for the
    fields the data source actually provides. Optional attributes that the
    source lacks are left unset.
    """
    # (apt record field, attribute name) pairs copied verbatim when present.
    _APT_OPTIONAL_FIELDS = (
        ('Homepage', 'homepage'),
        ('Depends', 'depends'),
        ('Pre-Depends', 'predepends'),
        ('Recommends', 'recommends'),
        ('Suggests', 'suggests'),
        ('Breaks', 'breaks'),
        ('Conflicts', 'conflicts'),
        ('Replaces', 'replaces'),
        ('Provides', 'provides'),
    )
    # (DDE result key, attribute name) pairs copied verbatim when non-empty.
    _DDE_OPTIONAL_FIELDS = (
        ('homepage', 'homepage'),
        ('depends', 'depends'),
        ('pre_depends', 'predepends'),
        ('recommends', 'recommends'),
        ('suggests', 'suggests'),
        ('conflicts', 'conflicts'),
        ('replaces', 'replaces'),
        ('provides', 'provides'),
    )

    def __init__(self,pkg_name):
        """Remember the package name; no details are loaded yet."""
        self.name = pkg_name

    def load_details_from_apt(self):
        """
        Fill the package attributes from the candidate version found in
        the local apt cache.
        """
        pkg_version = apt.Cache()[self.name].candidate
        record = pkg_version.record

        self.maintainer = record['Maintainer']
        self.version = pkg_version.version
        self.summary = pkg_version.summary
        self.description = self.format_description(pkg_version.description)
        self.section = pkg_version.section
        if 'Tag' in record:
            self.tags = self.debtags_str_to_dict(record['Tag'])
        for field, attr in self._APT_OPTIONAL_FIELDS:
            if field in record:
                setattr(self, attr, record[field])

    def load_details_from_dde(self,dde_server,dde_port):
        """
        Fill the package attributes from the JSON answer of a DDE server.

        dde_server, dde_port: host and port used to build the query URL.
        """
        # The three values must be passed to '%' as one tuple.
        url = ("http://%s:%s/q/udd/packages/all/%s?t=json"
               % (dde_server,dde_port,self.name))
        response = urllib.urlopen(url)
        try:
            json_data = json.load(response)
        finally:
            response.close()
        result = json_data['r']

        self.maintainer = result['maintainer']
        self.version = result['version']
        self.summary = result['description']
        self.description = self.format_description(result['long_description'])
        self.section = result['section']
        if result.get('tag'):
            self.tags = self.debtags_list_to_dict(result['tag'])
        # Optional fields: a missing key is treated like an empty value.
        for key, attr in self._DDE_OPTIONAL_FIELDS:
            value = result.get(key)
            if value:
                setattr(self, attr, value)
        self.popcon_insts = result['popcon']['insts']

    def format_description(self,description):
        """
        Return the description with every '.\\n' sequence removed and the
        remaining newlines replaced by '<br />'.
        """
        return description.replace('.\n','').replace('\n','<br />')

    def debtags_str_to_dict(self, debtags_str):
        """
        Convert a whitespace separated tag string, where tags may carry a
        trailing comma, into the dict built by debtags_list_to_dict.
        """
        debtags_list = [tag.rstrip(",") for tag in debtags_str.split()]
        return self.debtags_list_to_dict(debtags_list)

    def debtags_list_to_dict(self, debtags_list):
        """ input:  ['use::editing',
                     'works-with-format::gif',
                     'works-with-format::jpg',
                     'works-with-format::pdf']
            output: {'use': ['editing'],
                     'works-with-format': ['gif', 'jpg', 'pdf']}

            Tags that do not have the 'facet::subtag' form are logged and
            skipped.
        """
        debtags = {}
        for tag in debtags_list:
            match = re.search(r'^(.*)::(.*)$', tag)
            if not match:
                logging.info("Could not parse debtags format from tag: %s", tag)
                continue
            facet, subtag = match.groups()
            debtags.setdefault(facet, []).append(subtag)
        logging.debug("debtags_list %s", debtags)
        return debtags
  240 +
109 241 class PopconSubmission():
110 242 def __init__(self,path,user_id=0,binary=1):
111 243 self.packages = dict()
... ... @@ -174,11 +306,11 @@ class PopconXapianIndex(xapian.WritableDatabase):
174 306 self.max_popcon = cfg.max_popcon
175 307 self.valid_pkgs = []
176 308 # file format for filter: one package name per line
177   - with open(os.path.join(cfg.filters,cfg.pkgs_filter)) as valid_pkgs:
  309 + with open(cfg.pkgs_filter) as valid_pkgs:
178 310 self.valid_pkgs = [line.strip() for line in valid_pkgs
179 311 if not line.startswith("#")]
180 312 logging.debug("Considering %d valid packages" % len(self.valid_pkgs))
181   - with open(os.path.join(cfg.filters,"tags")) as valid_tags:
  313 + with open(os.path.join(cfg.filters_dir,"debtags")) as valid_tags:
182 314 self.valid_tags = [line.strip() for line in valid_tags
183 315 if not line.startswith("#")]
184 316 logging.debug("Considering %d valid tags" % len(self.valid_tags))
... ...