Commit 2e9ab843a42a3a87c77db71ac63340d6baa20c3e
Exists in
master
and in
1 other branch
Merge branch 'master' of github.com:tassia/AppRecommender
Showing
11 changed files
with
518 additions
and
416 deletions
Show diff stats
src/config.py
... | ... | @@ -46,8 +46,8 @@ class Config(): |
46 | 46 | self.popcon_index = "~/.app-recommender/popcon_index" |
47 | 47 | self.popcon_dir = "~/.app-recommender/popcon_dir" |
48 | 48 | self.clusters_dir = "~/.app-recommender/clusters_dir" |
49 | - self.strategy = "cta" # defaults to the cheapest one | |
50 | - self.reindex = 0 | |
49 | + self.strategy = "cb" # defaults to the cheapest one | |
50 | + self.weight = "bm25" | |
51 | 51 | self.load_options() |
52 | 52 | self.set_logger() |
53 | 53 | |
... | ... | @@ -63,22 +63,24 @@ class Config(): |
63 | 63 | print " -c, --config=PATH Path to configuration file." |
64 | 64 | print "" |
65 | 65 | print " [ recommender ]" |
66 | - print " -t, --tagsdb=PATH Path to debtags database." | |
67 | - print " -i, --tagsindex=PATH Path to debtags dedicated index." | |
68 | - print " -r, --force-reindex Force reindexing debtags database." | |
69 | 66 | print " -a, --axi=PATH Path to Apt-xapian-index." |
70 | 67 | print " -p, --popconindex=PATH Path to popcon dedicated index." |
71 | 68 | print " -m, --popcondir=PATH Path to popcon submissions dir." |
72 | 69 | print " -l, --clustersdir=PATH Path to popcon clusters dir." |
70 | + print " -w, --weight=OPTION Search weighting scheme." | |
73 | 71 | print " -s, --strategy=OPTION Recommendation strategy." |
74 | 72 | print "" |
73 | + print " [ weight options ] " | |
74 | + print " trad = traditional probabilistic weighting " | |
75 | + print " bm25 = bm25 weighting scheme " | |
76 | + print "" | |
75 | 77 | print " [ strategy options ] " |
76 | - print " ct = content-based using tags " | |
77 | - print " cta = content-based using tags via apt-xapian-index" | |
78 | - print " cp = content-based using package descriptions " | |
78 | + print " cb = content-based " | |
79 | + print " cbt = content-based using only tags as content " | |
80 | + print " cbd = content-based using only package descriptions as content " | |
79 | 81 | print " col = collaborative " |
80 | - print " colct = collaborative through tags content " | |
81 | - print " colcp = collaborative through package descriptions content " | |
82 | + #print " colct = collaborative through tags content " | |
83 | + #print " colcp = collaborative through package descriptions content " | |
82 | 84 | |
83 | 85 | def read_option(self, section, option): |
84 | 86 | """ |
... | ... | @@ -108,19 +110,17 @@ class Config(): |
108 | 110 | self.output_filename = self.read_option('general', 'output') |
109 | 111 | self.config = self.read_option('general', 'config') |
110 | 112 | |
111 | - self.tags_db = self.read_option('recommender', 'tags_db') | |
112 | - self.tags_index = self.read_option('recommender', 'tags_index') | |
113 | - self.reindex = self.read_option('recommender', 'reindex') | |
114 | 113 | self.axi = self.read_option('recommender', 'axi') |
115 | 114 | self.popcon_index = self.read_option('recommender', 'popcon_index') |
116 | 115 | self.popcon_dir = self.read_option('recommender', 'popcon_dir') |
117 | 116 | self.clusters_dir = self.read_option('recommender', 'clusters_dir') |
117 | + self.weight = self.read_option('recommender', 'weight') | |
118 | + self.strategy = self.read_option('recommender', 'strategy') | |
118 | 119 | |
119 | - short_options = "hdvo:c:t:i:ra:p:m:s:" | |
120 | + short_options = "hdvo:c:a:p:m:l:w:s:" | |
120 | 121 | long_options = ["help", "debug", "verbose", "output=", "config=", |
121 | - "tagsdb=", "tagsindex=", "reindex", "axi=", | |
122 | - "popconindex=", "popcondir=", "clustersdir=", | |
123 | - "strategy="] | |
122 | + "axi=", "popconindex=", "popcondir=", "clustersdir=", | |
123 | + "weight=", "strategy="] | |
124 | 124 | try: |
125 | 125 | opts, args = getopt.getopt(sys.argv[1:], short_options, |
126 | 126 | long_options) |
... | ... | @@ -142,12 +142,6 @@ class Config(): |
142 | 142 | self.output = p |
143 | 143 | elif o in ("-c", "--config"): |
144 | 144 | self.config = p |
145 | - elif o in ("-t", "--tagsdb"): | |
146 | - self.tags_db = p | |
147 | - elif o in ("-i", "--tagsindex"): | |
148 | - self.tags_index = p | |
149 | - elif o in ("-r", "--force-reindex"): | |
150 | - self.reindex = 1 | |
151 | 145 | elif o in ("-a", "--axi"): |
152 | 146 | self.axi = p + "/index" |
153 | 147 | self.axi_values = p + "/values" |
... | ... | @@ -157,6 +151,8 @@ class Config(): |
157 | 151 | self.popcon_dir = p |
158 | 152 | elif o in ("-l", "--clustersdir"): |
159 | 153 | self.popcon_dir = p |
154 | + elif o in ("-w", "--weight"): | |
155 | + self.weight = p | |
160 | 156 | elif o in ("-s", "--strategy"): |
161 | 157 | self.strategy = p |
162 | 158 | else: | ... | ... |
src/data.py
... | ... | @@ -35,29 +35,44 @@ from singleton import Singleton |
35 | 35 | import cluster |
36 | 36 | from dissimilarity import * |
37 | 37 | |
38 | -#class Item: | |
39 | -# """ | |
40 | -# Generic item definition. | |
41 | -# """ | |
42 | -# | |
43 | -#class Package(Item): | |
44 | -# """ | |
45 | -# Definition of a GNU/Linux application as a recommender item. | |
46 | -# """ | |
47 | -# def __init__(self,package_name): | |
48 | -# """ | |
49 | -# Set initial attributes. | |
50 | -# """ | |
51 | -# self.package_name = package_name | |
52 | -# | |
53 | -#def normalize_tags(string): | |
54 | -# """ | |
55 | -# Substitute string characters : by _ and - by '. | |
56 | -# Examples: | |
57 | -# admin::package-management -> admin__package'management | |
58 | -# implemented-in::c++ -> implemented-in__c++ | |
59 | -# """ | |
60 | -# return string.replace(':','_').replace('-','\'') | |
38 | +def axi_search_pkgs(axi,pkgs_list): | |
39 | + terms = ["XP"+item for item in pkgs_list] | |
40 | + query = xapian.Query(xapian.Query.OP_OR, terms) | |
41 | + enquire = xapian.Enquire(axi) | |
42 | + enquire.set_query(query) | |
43 | + matches = enquire.get_mset(0,axi.get_doccount()) | |
44 | + return matches | |
45 | + | |
46 | +def axi_search_pkg_tags(axi,pkg): | |
47 | + query = xapian.Query(xapian.Query.OP_OR, "XP"+pkg) | |
48 | + enquire = xapian.Enquire(axi) | |
49 | + enquire.set_query(query) | |
50 | + matches = enquire.get_mset(0,1) | |
51 | + for m in matches: | |
52 | + tags = [term.term for term in axi.get_document(m.docid).termlist() if | |
53 | + term.term.startswith("XT")] | |
54 | + return tags | |
55 | + | |
56 | +class SampleAptXapianIndex(xapian.WritableDatabase): | |
57 | + """ | |
58 | + Sample data source for packages information, mainly useful for tests. | |
59 | + """ | |
60 | + def __init__(self,pkgs_list,axi): | |
61 | + xapian.WritableDatabase.__init__(self,".sample_axi", | |
62 | + xapian.DB_CREATE_OR_OVERWRITE) | |
63 | + sample = axi_search_pkgs(axi,pkgs_list) | |
64 | + self.all_docs = [] | |
65 | + for package in sample: | |
66 | + doc_id = self.add_document(axi.get_document(package.docid)) | |
67 | + self.all_docs.append(doc_id) | |
68 | + | |
69 | + def _print(self): | |
70 | + print "---" | |
71 | + print xapian.WritableDatabase.__repr__(self) | |
72 | + print "---" | |
73 | + for doc_id in self.all_docs: | |
74 | + print [term.term for term in self.get_document(doc_id).termlist()] | |
75 | + print "---" | |
61 | 76 | |
62 | 77 | #[FIXME] get pkg tags from axi and remove load_debtags_db method |
63 | 78 | def load_debtags_db(db_path): |
... | ... | @@ -75,106 +90,6 @@ def load_debtags_db(db_path): |
75 | 90 | logging.error("Could not load DebtagsDB from '%s'." % self.db_path) |
76 | 91 | raise Error |
77 | 92 | |
78 | -#class TagsXapianIndex(xapian.WritableDatabase,Singleton): | |
79 | -# """ | |
80 | -# Data source for tags info defined as a singleton xapian database. | |
81 | -# """ | |
82 | -# def __init__(self,cfg): | |
83 | -# """ | |
84 | -# Set initial attributes. | |
85 | -# """ | |
86 | -# self.path = os.path.expanduser(cfg.tags_index) | |
87 | -# self.db_path = os.path.expanduser(cfg.tags_db) | |
88 | -# self.debtags_db = debtags.DB() | |
89 | -# try: | |
90 | -# db_file = open(self.db_path) | |
91 | -# except IOError: | |
92 | -# logging.error("Could not load DebtagsDB from '%s'." % self.db_path) | |
93 | -# raise Error | |
94 | -# md5 = hashlib.md5() | |
95 | -# md5.update(db_file.read()) | |
96 | -# self.db_md5 = md5.hexdigest() | |
97 | -# db_file.close() | |
98 | -# self.load_index(cfg.reindex) | |
99 | -# | |
100 | -## def load_db(self): | |
101 | -## """ | |
102 | -## Load debtags database from the source file. | |
103 | -## """ | |
104 | -## tag_filter = re.compile(r"^special::.+$|^.+::TODO$") | |
105 | -## try: | |
106 | -## db_file = open(self.db_path, "r") | |
107 | -## self.debtags_db.read(db_file,lambda x: not tag_filter.match(x)) | |
108 | -## db_file.close() | |
109 | -## except: | |
110 | -## logging.error("Could not load DebtagsDB from '%s'." % self.db_path) | |
111 | -## raise Error | |
112 | -# | |
113 | -# def relevant_tags_from_db(self,pkgs_list,qtd_of_tags): | |
114 | -# """ | |
115 | -# Return most relevant tags considering a list of packages. | |
116 | -# """ | |
117 | -# if not self.debtags_db.package_count(): | |
118 | -# #print "index vazio" | |
119 | -# self.debtags_db = load_debtags_db(self.db_path) | |
120 | -# relevant_db = self.debtags_db.choose_packages(pkgs_list) | |
121 | -# relevance_index = debtags.relevance_index_function(self.debtags_db, | |
122 | -# relevant_db) | |
123 | -# sorted_relevant_tags = sorted(relevant_db.iter_tags(), | |
124 | -# lambda a, b: cmp(relevance_index(a), | |
125 | -# relevance_index(b))) | |
126 | -# return normalize_tags(' '.join(sorted_relevant_tags[-qtd_of_tags:])) | |
127 | -# | |
128 | -# def load_index(self,reindex): | |
129 | -# """ | |
130 | -# Load an existing debtags index. | |
131 | -# """ | |
132 | -# if not reindex: | |
133 | -# try: | |
134 | -# logging.info("Opening existing debtags xapian index at \'%s\'" | |
135 | -# % self.path) | |
136 | -# xapian.Database.__init__(self,self.path) | |
137 | -# md5 = self.get_metadata("md5") | |
138 | -# if not md5 == self.db_md5: | |
139 | -# logging.info("Index must be updated.") | |
140 | -# reindex = 1 | |
141 | -# except xapian.DatabaseError: | |
142 | -# logging.info("Could not open debtags index.") | |
143 | -# reindex =1 | |
144 | -# | |
145 | -# if reindex: | |
146 | -# self.new_index() | |
147 | -# | |
148 | -# def new_index(self): | |
149 | -# """ | |
150 | -# Create a xapian index for debtags info based on 'debtags_db' and | |
151 | -# place it at 'self.path'. | |
152 | -# """ | |
153 | -# if not os.path.exists(self.path): | |
154 | -# os.makedirs(self.path) | |
155 | -# | |
156 | -# try: | |
157 | -# logging.info("Indexing debtags info from \'%s\'" % | |
158 | -# self.db_path) | |
159 | -# logging.info("Creating new xapian index at \'%s\'" % | |
160 | -# self.path) | |
161 | -# xapian.WritableDatabase.__init__(self,self.path, | |
162 | -# xapian.DB_CREATE_OR_OVERWRITE) | |
163 | -# except xapian.DatabaseError: | |
164 | -# logging.critical("Could not create xapian index.") | |
165 | -# raise Error | |
166 | -# | |
167 | -# self.debtags_db = load_debtags_db(self.db_path) | |
168 | -# self.set_metadata("md5",self.db_md5) | |
169 | -# | |
170 | -# for pkg,tags in self.debtags_db.iter_packages_tags(): | |
171 | -# doc = xapian.Document() | |
172 | -# doc.set_data(pkg) | |
173 | -# for tag in tags: | |
174 | -# doc.add_term(normalize_tags(tag)) | |
175 | -# doc_id = self.add_document(doc) | |
176 | -# logging.debug("Debtags Xapian: Indexing doc %d",doc_id) | |
177 | - | |
178 | 93 | class PopconXapianIndex(xapian.WritableDatabase,Singleton): |
179 | 94 | """ |
180 | 95 | Data source for popcon submissions defined as a singleton xapian database. | ... | ... |
src/recommender.py
... | ... | @@ -19,10 +19,10 @@ __license__ = """ |
19 | 19 | along with this program. If not, see <http://www.gnu.org/licenses/>. |
20 | 20 | """ |
21 | 21 | |
22 | -from operator import itemgetter | |
23 | -from data import * | |
24 | -from strategy import * | |
25 | -from error import Error | |
22 | +import xapian | |
23 | +import operator | |
24 | +import data | |
25 | +import strategy | |
26 | 26 | |
27 | 27 | class RecommendationResult: |
28 | 28 | """ |
... | ... | @@ -40,7 +40,7 @@ class RecommendationResult: |
40 | 40 | """ |
41 | 41 | result = self.get_prediction() |
42 | 42 | str = "\n" |
43 | - for i in range(len(result)): | |
43 | + for i in range(len((list(result)))): | |
44 | 44 | str += "%2d: %s\n" % (i,result[i][0]) |
45 | 45 | return str |
46 | 46 | |
... | ... | @@ -48,8 +48,10 @@ class RecommendationResult: |
48 | 48 | """ |
49 | 49 | Return prediction based on recommendation size (number of items). |
50 | 50 | """ |
51 | - sorted_result = sorted(self.item_score.items(), key=itemgetter(1)) | |
52 | - return reversed(sorted_result[-size:]) | |
51 | + if size > len(self.item_score): size = len(self.item_score) | |
52 | + sorted_result = sorted(self.item_score.items(), | |
53 | + key=operator.itemgetter(1)) | |
54 | + return list(reversed(sorted_result[-size:])) | |
53 | 55 | |
54 | 56 | class Recommender: |
55 | 57 | """ |
... | ... | @@ -59,47 +61,30 @@ class Recommender: |
59 | 61 | """ |
60 | 62 | Set initial parameters. |
61 | 63 | """ |
62 | - try: | |
63 | - strategy = "self."+cfg.strategy+"(cfg)" | |
64 | - exec(strategy) | |
65 | - except (NameError, AttributeError, SyntaxError) as err: | |
66 | - print err | |
67 | - logging.critical("Could not perform recommendation strategy '%s'" % | |
68 | - cfg.strategy) | |
69 | - raise Error | |
70 | - | |
71 | - def ct(self,cfg): | |
72 | - """ | |
73 | - Set recommender attributes to perform content-based recommendation | |
74 | - using tags index as source data. | |
75 | - """ | |
76 | - self.items_repository = TagsXapianIndex(cfg) | |
77 | - self.strategy = ContentBasedStrategy() | |
78 | - | |
79 | - def cta(self,cfg): | |
80 | - """ | |
81 | - Set recommender attributes to perform content-based recommendation | |
82 | - using apt-xapian-index as source data. | |
83 | - """ | |
84 | 64 | self.items_repository = xapian.Database(cfg.axi) |
85 | - self.strategy = AxiContentBasedStrategy() | |
86 | - | |
87 | - def col(self,cfg): | |
88 | - """ | |
89 | - Set recommender attributes to perform collaborative recommendation | |
90 | - using popcon-xapian-index as source data. | |
91 | - """ | |
92 | - self.users_repository = PopconXapianIndex(cfg) | |
93 | - self.strategy = CollaborativeStrategy() | |
65 | + self.users_repository = data.PopconXapianIndex(cfg) #[FIXME] only cfg fields | |
66 | + self.clustered_users_repository = data.PopconXapianIndex(cfg) #[FIXME] | |
67 | + self.set_strategy(cfg.strategy) | |
68 | + if cfg.weight == "bm25": | |
69 | + self.weight = xapian.BM25Weight() | |
70 | + else: | |
71 | + self.weight = xapian.TradWeight() | |
94 | 72 | |
95 | - def set_strategy(self,strategy): | |
73 | + def set_strategy(self,strategy_str): | |
96 | 74 | """ |
97 | 75 | Set the recommendation strategy. |
98 | 76 | """ |
99 | - self.strategy = strategy | |
77 | + if strategy_str == "cb": | |
78 | + self.strategy = strategy.ContentBasedStrategy("full") | |
79 | + if strategy_str == "cbt": | |
80 | + self.strategy = strategy.ContentBasedStrategy("tag") | |
81 | + if strategy_str == "cbd": | |
82 | + self.strategy = strategy.ContentBasedStrategy("desc") | |
83 | + if strategy_str == "col": | |
84 | + self.strategy = strategy.CollaborativeStrategy(20) | |
100 | 85 | |
101 | - def get_recommendation(self,user): | |
86 | + def get_recommendation(self,user,limit=20): | |
102 | 87 | """ |
103 | 88 | Produces recommendation using previously loaded strategy. |
104 | 89 | """ |
105 | - return self.strategy.run(self,user) | |
90 | + return self.strategy.run(self,user,limit) | ... | ... |
src/strategy.py
... | ... | @@ -20,54 +20,27 @@ __license__ = """ |
20 | 20 | along with this program. If not, see <http://www.gnu.org/licenses/>. |
21 | 21 | """ |
22 | 22 | |
23 | -import string | |
24 | -import os, re | |
25 | 23 | import xapian |
26 | -from data import * | |
27 | 24 | from singleton import Singleton |
28 | 25 | import recommender |
29 | - | |
30 | -class ReputationHeuristic(Singleton): | |
31 | - """ | |
32 | - Abstraction for diferent reputation heuristics. | |
33 | - """ | |
34 | - pass | |
35 | - | |
36 | -class BugsHeuristic(ReputationHeuristic): | |
37 | - """ | |
38 | - Reputation heuristic based on quantity of open bugs. | |
39 | - """ | |
40 | - pass | |
41 | - | |
42 | -class RCBugsHeuristic(ReputationHeuristic): | |
43 | - """ | |
44 | - Reputation heuristic based on quantity of RC bugs. | |
45 | - """ | |
46 | - pass | |
47 | - | |
48 | -class PopularityHeuristic(ReputationHeuristic): | |
49 | - """ | |
50 | - Reputation heuristic based on popularity of packages. | |
51 | - """ | |
52 | - pass | |
26 | +from data import * | |
53 | 27 | |
54 | 28 | class PkgMatchDecider(xapian.MatchDecider): |
55 | 29 | """ |
56 | 30 | Extend xapian.MatchDecider to not consider installed packages. |
57 | 31 | """ |
58 | - | |
59 | - def __init__(self, installed_pkgs): | |
32 | + def __init__(self, pkgs_list): | |
60 | 33 | """ |
61 | 34 | Set initial parameters. |
62 | 35 | """ |
63 | 36 | xapian.MatchDecider.__init__(self) |
64 | - self.installed_pkgs = installed_pkgs | |
37 | + self.pkgs_list = pkgs_list | |
65 | 38 | |
66 | 39 | def __call__(self, doc): |
67 | 40 | """ |
68 | 41 | True if the package is not already installed. |
69 | 42 | """ |
70 | - return doc.get_data() not in self.installed_pkgs | |
43 | + return doc.get_data() not in self.pkgs_list | |
71 | 44 | |
72 | 45 | class UserMatchDecider(xapian.MatchDecider): |
73 | 46 | """ |
... | ... | @@ -80,51 +53,35 @@ class UserMatchDecider(xapian.MatchDecider): |
80 | 53 | """ |
81 | 54 | xapian.MatchDecider.__init__(self) |
82 | 55 | self.profile = profile |
83 | - print "mdecider:",profile | |
84 | 56 | |
85 | 57 | def __call__(self, doc): |
86 | 58 | """ |
87 | 59 | True if the user has at least half of the packages from the profile. |
88 | 60 | """ |
89 | - profile_size = len(self.profile) | |
90 | - pkg_match=0 | |
61 | + match=0 | |
91 | 62 | for term in doc: |
92 | 63 | if term.term in self.profile: |
93 | - pkg_match = pkg_match+1 | |
94 | - print "id",doc.get_docid(),"match",pkg_match | |
95 | - return pkg_match >= profile_size/2 | |
64 | + match = match+1 | |
65 | + return (match >= len(self.profile)/2) | |
96 | 66 | |
97 | 67 | class PkgExpandDecider(xapian.ExpandDecider): |
98 | 68 | """ |
99 | 69 | Extend xapian.ExpandDecider to consider packages only. |
100 | 70 | """ |
101 | - | |
102 | - def __init__(self): | |
103 | - """ | |
104 | - Call base class init. | |
105 | - """ | |
106 | - xapian.ExpandDecider.__init__(self) | |
107 | - | |
108 | 71 | def __call__(self, term): |
109 | 72 | """ |
110 | 73 | True if the term is a package. |
111 | 74 | """ |
75 | + # [FIXME] return term.startswith("XP") | |
112 | 76 | return not term.startswith("XT") |
113 | 77 | |
114 | 78 | class TagExpandDecider(xapian.ExpandDecider): |
115 | 79 | """ |
116 | 80 | Extend xapian.ExpandDecider to consider tags only. |
117 | 81 | """ |
118 | - | |
119 | - def __init__(self, profile): | |
120 | - """ | |
121 | - Call base class init. | |
122 | - """ | |
123 | - xapian.ExpandDecider.__init__(self) | |
124 | - | |
125 | - def __call__(self, doc): | |
82 | + def __call__(self, term): | |
126 | 83 | """ |
127 | - True if the user has more the half of packages from profile. | |
84 | + True if the term is a tag. | |
128 | 85 | """ |
129 | 86 | return term.startswith("XT") |
130 | 87 | |
... | ... | @@ -134,65 +91,30 @@ class RecommendationStrategy: |
134 | 91 | """ |
135 | 92 | pass |
136 | 93 | |
137 | -class ItemReputationStrategy(RecommendationStrategy): | |
138 | - """ | |
139 | - Recommendation strategy based on items reputation. | |
140 | - """ | |
141 | - def run(self,items_list,heuristic): | |
142 | - """ | |
143 | - Perform recommendation strategy. | |
144 | - """ | |
145 | - logging.critical("Item reputation recommendation strategy is not yet implemented.") | |
146 | - raise Error | |
147 | - | |
148 | -#class ContentBasedStrategy(RecommendationStrategy): | |
149 | -# """ | |
150 | -# Content-based recommendation strategy. | |
151 | -# """ | |
152 | -# def run(self,rec,user): | |
153 | -# """ | |
154 | -# Perform recommendation strategy. | |
155 | -# """ | |
156 | -# profile = user.txi_tag_profile(rec.items_repository,50) | |
157 | -# qp = xapian.QueryParser() | |
158 | -# query = qp.parse_query(profile) | |
159 | -# enquire = xapian.Enquire(rec.items_repository) | |
160 | -# enquire.set_query(query) | |
161 | -# | |
162 | -# try: | |
163 | -# mset = enquire.get_mset(0, 20, None, PkgMatchDecider(user.items())) | |
164 | -# except xapian.DatabaseError as error: | |
165 | -# logging.critical(error.get_msg()) | |
166 | -# raise Error | |
167 | -# | |
168 | -# item_score = {} | |
169 | -# for m in mset: | |
170 | -# item_score[m.document.get_data()] = m.rank | |
171 | -# return recommender.RecommendationResult(item_score,20) | |
172 | - | |
173 | -class AxiContentBasedStrategy(RecommendationStrategy): | |
94 | +class ContentBasedStrategy(RecommendationStrategy): | |
174 | 95 | """ |
175 | 96 | Content-based recommendation strategy based on Apt-xapian-index. |
176 | 97 | """ |
177 | - def __init__(self): | |
98 | + def __init__(self,content): | |
178 | 99 | self.description = "Content-based" |
100 | + self.content = content | |
179 | 101 | |
180 | - def run(self,rec,user): | |
102 | + def run(self,rec,user,limit): | |
181 | 103 | """ |
182 | 104 | Perform recommendation strategy. |
183 | 105 | """ |
184 | - profile = user.axi_tag_profile(rec.items_repository,50) | |
185 | - #profile_str = string.join(list(profile),' ') | |
186 | - query = xapian.Query(xapian.Query.OP_OR,list(profile)) | |
106 | + profile = user.profile(rec.items_repository,self.content,50) | |
107 | + # prepare index for querying user profile | |
108 | + query = xapian.Query(xapian.Query.OP_OR,profile) | |
187 | 109 | enquire = xapian.Enquire(rec.items_repository) |
110 | + enquire.set_weighting_scheme(rec.weight) | |
188 | 111 | enquire.set_query(query) |
189 | - | |
190 | 112 | try: |
191 | - mset = enquire.get_mset(0, 20, None, PkgMatchDecider(user.items())) | |
113 | + # retrieve matching packages | |
114 | + mset = enquire.get_mset(0, limit, None, PkgMatchDecider(user.items())) | |
192 | 115 | except xapian.DatabaseError as error: |
193 | - logging.critical(error.get_msg()) | |
194 | - raise Error | |
195 | - | |
116 | + logging.critical("Content-based strategy: "+error.get_msg()) | |
117 | + # compose result dictionary | |
196 | 118 | item_score = {} |
197 | 119 | for m in mset: |
198 | 120 | item_score[m.document.get_data()] = m.weight |
... | ... | @@ -202,66 +124,107 @@ class CollaborativeStrategy(RecommendationStrategy): |
202 | 124 | """ |
203 | 125 | Collaborative recommendation strategy. |
204 | 126 | """ |
205 | - def __init__(self): | |
127 | + def __init__(self,k,clustering=1): | |
206 | 128 | self.description = "Collaborative" |
129 | + self.clustering = clustering | |
130 | + self.neighbours = k | |
207 | 131 | |
208 | - #def run(self,rec,user,similarity_measure): | |
209 | - def run(self,rec,user): | |
132 | + def run(self,rec,user,limit): | |
210 | 133 | """ |
211 | 134 | Perform recommendation strategy. |
212 | 135 | """ |
213 | - profile = user.maximal_pkg_profile() | |
214 | - #profile_str = string.join(list(profile),' ') | |
215 | - query = xapian.Query(xapian.Query.OP_OR,list(profile)) | |
216 | - enquire = xapian.Enquire(rec.users_repository) | |
136 | + profile = user.pkg_profile | |
137 | + # prepare index for querying user profile | |
138 | + query = xapian.Query(xapian.Query.OP_OR,profile) | |
139 | + if self.clustering: | |
140 | + enquire = xapian.Enquire(rec.clustered_users_repository) | |
141 | + else: | |
142 | + enquire = xapian.Enquire(rec.users_repository) | |
143 | + enquire.set_weighting_scheme(rec.weight) | |
217 | 144 | enquire.set_query(query) |
218 | - | |
219 | 145 | try: |
220 | - #mset = enquire.get_mset(0, 182, None, UserMatchDecider(profile)) | |
221 | - mset = enquire.get_mset(0, 20) | |
146 | + # retrieve matching users | |
147 | + mset = enquire.get_mset(0, self.neighbours) | |
222 | 148 | except xapian.DatabaseError as error: |
223 | - logging.critical(error.get_msg()) | |
224 | - raise Error | |
225 | - | |
149 | + logging.critical("Collaborative strategy: "+error.get_msg()) | |
226 | 150 | rset = xapian.RSet() |
151 | + logging.debug("Neighborhood composed by the following users (by hash)") | |
227 | 152 | for m in mset: |
228 | 153 | rset.add_document(m.document.get_docid()) |
229 | - logging.debug("Counting as relevant submission %s" % | |
230 | - m.document.get_data()) | |
231 | - | |
232 | - eset = enquire.get_eset(20,rset,PkgExpandDecider()) | |
233 | - rank = 0 | |
154 | + logging.debug(m.document.get_data()) | |
155 | + # retrieve most relevant packages | |
156 | + eset = enquire.get_eset(limit,rset,PkgExpandDecider()) | |
157 | + # compose result dictionary | |
234 | 158 | item_score = {} |
235 | - for term in eset: | |
236 | - item_score[term.term] = rank | |
237 | - rank = rank+1 | |
238 | - | |
159 | + for package in eset: | |
160 | + item_score[package.term.lstrip("XP")] = package.weight | |
239 | 161 | return recommender.RecommendationResult(item_score) |
240 | 162 | |
163 | +class DemographicStrategy(RecommendationStrategy): | |
164 | + """ | |
165 | + Recommendation strategy based on demographic data. | |
166 | + """ | |
167 | + def __init__(self): | |
168 | + self.description = "Demographic" | |
169 | + logging.debug("Demographic recommendation not yet implemented.") | |
170 | + raise Error | |
171 | + | |
172 | + def run(self,user,items_repository): | |
173 | + """ | |
174 | + Perform recommendation strategy. | |
175 | + """ | |
176 | + pass | |
177 | + | |
241 | 178 | class KnowledgeBasedStrategy(RecommendationStrategy): |
242 | 179 | """ |
243 | 180 | Knowledge-based recommendation strategy. |
244 | 181 | """ |
245 | 182 | def __init__(self): |
246 | 183 | self.description = "Knowledge-based" |
184 | + logging.debug("Knowledge-based recommendation not yet implemented.") | |
185 | + raise Error | |
247 | 186 | |
248 | 187 | def run(self,user,knowledge_repository): |
249 | 188 | """ |
250 | 189 | Perform recommendation strategy. |
251 | 190 | """ |
252 | - logging.critical("Knowledge-based recommendation strategy is not yet implemented.") | |
253 | - raise Error | |
191 | + pass | |
254 | 192 | |
255 | -class DemographicStrategy(RecommendationStrategy): | |
193 | +class ReputationHeuristic(Singleton): | |
256 | 194 | """ |
257 | - Recommendation strategy based on demographic data. | |
195 | + Abstraction for different reputation heuristics. | |
196 | + """ | |
197 | + pass | |
198 | + | |
199 | +class BugsHeuristic(ReputationHeuristic): | |
200 | + """ | |
201 | + Reputation heuristic based on quantity of open bugs. | |
202 | + """ | |
203 | + pass | |
204 | + | |
205 | +class RCBugsHeuristic(ReputationHeuristic): | |
206 | + """ | |
207 | + Reputation heuristic based on quantity of RC bugs. | |
208 | + """ | |
209 | + pass | |
210 | + | |
211 | +class PopularityHeuristic(ReputationHeuristic): | |
212 | + """ | |
213 | + Reputation heuristic based on popularity of packages. | |
214 | + """ | |
215 | + pass | |
216 | + | |
217 | +class ItemReputationStrategy(RecommendationStrategy): | |
218 | + """ | |
219 | + Recommendation strategy based on items reputation. | |
258 | 220 | """ |
259 | 221 | def __init__(self): |
260 | - self.description = "Demographic" | |
222 | + self.description = "Item reputation" | |
223 | + logging.debug("Item reputation recommendation not yet implemented.") | |
224 | + raise Error | |
261 | 225 | |
262 | - def run(self,user,items_repository): | |
226 | + def run(self,items_list,heuristic): | |
263 | 227 | """ |
264 | 228 | Perform recommendation strategy. |
265 | 229 | """ |
266 | - logging.critical("Demographic recommendation strategy is not yet implemented.") | |
267 | - raise Error | |
230 | + pass | ... | ... |
src/tests/package-xapian-index
... | ... | @@ -1,10 +0,0 @@ |
1 | -aaphoto: implemented-in::c, interface::commandline, role::program, use::editing, works-with::image | |
2 | -dia: implemented-in::c, interface::x11, role::program, scope::application, suite::gnu, uitoolkit::gtk, use::editing, works-with::image, works-with::image:vector, x11::application | |
3 | -eog: implemented-in::c, interface::x11, role::program, scope::application, suite::gnome, uitoolkit::gtk, use::viewing, works-with-format::jpg, works-with-format::png, works-with::image, works-with::image:raster, works-with::image:vector, x11::application | |
4 | -emacs: devel::editor, role::dummy, role::metapackage, special::meta, suite::emacs, suite::gnu, use::editing | |
5 | -ferret: devel::modelling, role::program, scope::application, suite::gnu, works-with::db | |
6 | -festival: accessibility::speech, devel::interpreter, implemented-in::scheme, interface::text-mode, network::client, network::server, role::program, sound::speech, uitoolkit::ncurses, works-with::audio | |
7 | -file: admin::forensics, implemented-in::c, interface::commandline, role::program, scope::utility, use::analysing, use::scanning, works-with::file | |
8 | -gimp: implemented-in::c, interface::x11, role::program, scope::application, suite::gimp, suite::gnu, uitoolkit::gtk, use::editing, works-with-format::gif, works-with-format::jpg, works-with-format::pdf, works-with-format::png, works-with-format::tiff, works-with::image, works-with::image:raster, works-with::text, x11::application | |
9 | -inkscape: implemented-in::c, implemented-in::c++, interface::x11, role::program, scope::application, uitoolkit::gtk, use::editing, works-with-format::pdf, works-with-format::postscript, works-with-format::svg, works-with-format::tex, works-with::image, works-with::image:vector, x11::application | |
10 | -xpdf: implemented-in::c++, interface::x11, role::program, scope::application, uitoolkit::motif, use::viewing, works-with-format::pdf, works-with::text, x11::application |
... | ... | @@ -0,0 +1,69 @@ |
1 | +#!/usr/bin/env python | |
2 | +""" | |
3 | + recommenderTests - Recommender class test case | |
4 | +""" | |
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | |
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | |
7 | +__license__ = """ | |
8 | + This program is free software: you can redistribute it and/or modify | |
9 | + it under the terms of the GNU General Public License as published by | |
10 | + the Free Software Foundation, either version 3 of the License, or | |
11 | + (at your option) any later version. | |
12 | + | |
13 | + This program is distributed in the hope that it will be useful, | |
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | + GNU General Public License for more details. | |
17 | + | |
18 | + You should have received a copy of the GNU General Public License | |
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | +""" | |
21 | + | |
22 | +import unittest2 | |
23 | +import sys | |
24 | +sys.path.insert(0,'../') | |
25 | +from recommender import RecommendationResult, Recommender | |
26 | +from user import User | |
27 | +from config import Config | |
28 | +from strategy import ContentBasedStrategy, CollaborativeStrategy | |
29 | + | |
30 | +class RecommendationResultTests(unittest2.TestCase): | |
31 | + @classmethod | |
32 | + def setUpClass(self): | |
33 | + self.result = RecommendationResult({"gimp":1.5,"inkscape":3.0,"eog":1}) | |
34 | + | |
35 | + def test_str(self): | |
36 | + string = "\n 0: inkscape\n 1: gimp\n 2: eog\n" | |
37 | + self.assertEqual(self.result.__str__(),string) | |
38 | + | |
39 | + def test_get_prediction(self): | |
40 | + prediction = [("inkscape",3.0),("gimp",1.5),("eog",1)] | |
41 | + self.assertEqual(self.result.get_prediction(),prediction) | |
42 | + | |
43 | +class RecommenderTests(unittest2.TestCase): | |
44 | + @classmethod | |
45 | + def setUpClass(self): | |
46 | + cfg = Config() | |
47 | + self.rec = Recommender(cfg) | |
48 | + | |
49 | + def test_set_strategy(self): | |
50 | + self.rec.set_strategy("cb") | |
51 | + self.assertIsInstance(self.rec.strategy,ContentBasedStrategy) | |
52 | + self.assertEqual(self.rec.strategy.content,"full") | |
53 | + self.rec.set_strategy("cbt") | |
54 | + self.assertIsInstance(self.rec.strategy,ContentBasedStrategy) | |
55 | + self.assertEqual(self.rec.strategy.content,"tag") | |
56 | + self.rec.set_strategy("cbd") | |
57 | + self.assertIsInstance(self.rec.strategy,ContentBasedStrategy) | |
58 | + self.assertEqual(self.rec.strategy.content,"desc") | |
59 | + self.rec.set_strategy("col") | |
60 | + self.assertIsInstance(self.rec.strategy,CollaborativeStrategy) | |
61 | + | |
62 | + def test_get_recommendation(self): | |
63 | + user = User({"inkscape": 1, "gimp": 1, "eog":1}) | |
64 | + result = self.rec.get_recommendation(user) | |
65 | + self.assertIsInstance(result, RecommendationResult) | |
66 | + self.assertGreater(len(result.item_score),0) | |
67 | + | |
68 | +if __name__ == '__main__': | |
69 | + unittest2.main() | ... | ... |
src/tests/runner.py
1 | 1 | #!/usr/bin/env python |
2 | 2 | """ |
3 | - tests - execution of the whole set of tests suites. | |
3 | + runner - Run the whole set of test case suites. | |
4 | 4 | """ |
5 | 5 | __author__ = "Tassia Camoes Araujo <tassia@gmail.com>" |
6 | 6 | __copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" |
... | ... | @@ -20,9 +20,30 @@ __license__ = """ |
20 | 20 | """ |
21 | 21 | |
22 | 22 | import unittest2 |
23 | -import user_tests | |
24 | -import singleton_tests | |
23 | +from user_tests import UserTests, FilterTagTests, FilterDescriptionTests | |
24 | +from recommender_tests import RecommendationResultTests, RecommenderTests | |
25 | +from strategy_tests import (PkgMatchDeciderTests, UserMatchDeciderTests, | |
26 | + PkgExpandDeciderTests, TagExpandDeciderTests, ContentBasedStrategyTests, | |
27 | + CollaborativeStrategyTests, DemographicStrategyTests, | |
28 | + KnowledgeBasedStrategyTests, ItemReputationStrategyTests) | |
29 | +from singleton_tests import SingletonTests | |
30 | + | |
31 | +def load_tests(test_cases): | |
32 | + suite = unittest2.TestSuite() | |
33 | + for test_class in test_cases: | |
34 | + tests = unittest2.TestLoader().loadTestsFromTestCase(test_class) | |
35 | + suite.addTests(tests) | |
36 | + return suite | |
37 | + | |
38 | +test_lists = [[UserTests, FilterTagTests, FilterDescriptionTests], | |
39 | + [RecommendationResultTests, RecommenderTests], | |
40 | + [PkgMatchDeciderTests, UserMatchDeciderTests, | |
41 | + PkgExpandDeciderTests, TagExpandDeciderTests, | |
42 | + ContentBasedStrategyTests, CollaborativeStrategyTests, | |
43 | + DemographicStrategyTests, KnowledgeBasedStrategyTests, | |
44 | + ItemReputationStrategyTests], | |
45 | + [SingletonTests]] | |
25 | 46 | |
26 | 47 | runner = unittest2.TextTestRunner() |
27 | -runner.run(user_tests.suite()) | |
28 | -runner.run(singleton_tests.suite()) | |
48 | +for module in test_lists: | |
49 | + runner.run(load_tests(module)) | ... | ... |
src/tests/singleton_tests.py
... | ... | @@ -24,9 +24,6 @@ import sys |
24 | 24 | sys.path.insert(0,'../') |
25 | 25 | from singleton import Singleton |
26 | 26 | |
27 | -def suite(): | |
28 | - return unittest2.TestLoader().loadTestsFromTestCase(SingletonTests) | |
29 | - | |
30 | 27 | class SingletonTests(unittest2.TestCase): |
31 | 28 | def test_creation(self): |
32 | 29 | object_1 = Singleton() | ... | ... |
... | ... | @@ -0,0 +1,116 @@ |
1 | +#!/usr/bin/env python | |
2 | +""" | |
3 | + strategyTests - Recommendation strategies classes test case | |
4 | +""" | |
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | |
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | |
7 | +__license__ = """ | |
8 | + This program is free software: you can redistribute it and/or modify | |
9 | + it under the terms of the GNU General Public License as published by | |
10 | + the Free Software Foundation, either version 3 of the License, or | |
11 | + (at your option) any later version. | |
12 | + | |
13 | + This program is distributed in the hope that it will be useful, | |
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | + GNU General Public License for more details. | |
17 | + | |
18 | + You should have received a copy of the GNU General Public License | |
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | +""" | |
21 | + | |
22 | +import unittest2 | |
23 | +import xapian | |
24 | +import sys | |
25 | +sys.path.insert(0,'../') | |
26 | +from error import Error | |
27 | +from user import User | |
28 | +from recommender import RecommendationResult | |
29 | +from config import * | |
30 | +#from data import * | |
31 | +from strategy import (PkgMatchDecider, UserMatchDecider, PkgExpandDecider, | |
32 | + TagExpandDecider, ContentBasedStrategy, | |
33 | + CollaborativeStrategy, DemographicStrategy, | |
34 | + KnowledgeBasedStrategy, ItemReputationStrategy) | |
35 | + | |
36 | +class PkgMatchDeciderTests(unittest2.TestCase): | |
37 | + @classmethod | |
38 | + def setUpClass(self): | |
39 | + pkgs_list = ["gimp","eog","inkscape"] | |
40 | + self.decider = PkgMatchDecider(pkgs_list) | |
41 | + self.doc = xapian.Document() | |
42 | + | |
43 | + def test_match(self): | |
44 | + self.doc.set_data("emacs") | |
45 | + self.assertTrue(self.decider(self.doc)) | |
46 | + | |
47 | + def test_no_match(self): | |
48 | + self.doc.set_data("gimp") | |
49 | + self.assertFalse(self.decider(self.doc)) | |
50 | + | |
51 | +class UserMatchDeciderTests(unittest2.TestCase): | |
52 | + @classmethod | |
53 | + def setUpClass(self): | |
54 | + user_profile = ["gimp","eog","inkscape", "emacs"] | |
55 | + self.decider = UserMatchDecider(user_profile) | |
56 | + | |
57 | + def setUp(self): | |
58 | + self.doc = xapian.Document() | |
59 | + | |
60 | + def test_match(self): | |
61 | + self.doc.add_term("emacs") | |
62 | + self.doc.add_term("gimp") | |
63 | + self.doc.add_term("eog") | |
64 | + self.assertTrue(self.decider(self.doc)) | |
65 | + | |
66 | + def test_no_match(self): | |
67 | + self.doc.add_term("gimp") | |
68 | + self.assertFalse(self.decider(self.doc)) | |
69 | + | |
70 | +class PkgExpandDeciderTests(unittest2.TestCase): | |
71 | + @classmethod | |
72 | + def setUpClass(self): | |
73 | + self.decider = PkgExpandDecider() | |
74 | + | |
75 | + def test_match(self): | |
76 | + self.assertTrue(self.decider("XPgimp")) | |
77 | + | |
78 | + def test_no_match(self): | |
79 | + self.assertFalse(self.decider("XTgimp")) | |
80 | + | |
81 | +class TagExpandDeciderTests(unittest2.TestCase): | |
82 | + @classmethod | |
83 | + def setUpClass(self): | |
84 | + self.decider = TagExpandDecider() | |
85 | + | |
86 | + def test_match(self): | |
87 | + self.assertTrue(self.decider("XTgimp")) | |
88 | + | |
89 | + def test_no_match(self): | |
90 | + self.assertFalse(self.decider("gimp")) | |
91 | + | |
92 | +class ContentBasedStrategyTests(unittest2.TestCase): | |
93 | + @classmethod | |
94 | + def setUpClass(self): | |
95 | + | |
96 | + pass | |
97 | + | |
98 | +class CollaborativeStrategyTests(unittest2.TestCase): | |
99 | + @classmethod | |
100 | + def setUpClass(self): | |
101 | + pass | |
102 | + | |
103 | +class DemographicStrategyTests(unittest2.TestCase): | |
104 | + def test_call(self): | |
105 | + self.assertRaises(Error,lambda: DemographicStrategy()) | |
106 | + | |
107 | +class KnowledgeBasedStrategyTests(unittest2.TestCase): | |
108 | + def test_call(self): | |
109 | + self.assertRaises(Error,lambda: KnowledgeBasedStrategy()) | |
110 | + | |
111 | +class ItemReputationStrategyTests(unittest2.TestCase): | |
112 | + def test_call(self): | |
113 | + self.assertRaises(Error,lambda: ItemReputationStrategy()) | |
114 | + | |
115 | +if __name__ == '__main__': | |
116 | + unittest2.main() | ... | ... |
src/tests/user_tests.py
... | ... | @@ -19,26 +19,39 @@ __license__ = """ |
19 | 19 | along with this program. If not, see <http://www.gnu.org/licenses/>. |
20 | 20 | """ |
21 | 21 | |
22 | -import operator | |
23 | -import math | |
24 | 22 | import unittest2 |
25 | 23 | import xapian |
26 | 24 | import sys |
27 | 25 | sys.path.insert(0,'../') |
28 | -from user import * | |
29 | -from config import * | |
30 | -from data import * | |
26 | +from user import User, FilterTag, FilterDescription | |
27 | +from config import Config | |
28 | +from data import SampleAptXapianIndex | |
31 | 29 | |
32 | -def suite(): | |
33 | - return unittest2.TestLoader().loadTestsFromTestCase(UserTests) | |
30 | +class FilterTagTests(unittest2.TestCase): | |
31 | + def test_call_true(self): | |
32 | + self.assertTrue(FilterTag()("XTrole::program")) | |
33 | + | |
34 | + def test_call_false(self): | |
35 | + self.assertFalse(FilterTag()("role::program")) | |
36 | + | |
37 | +class FilterDescriptionTests(unittest2.TestCase): | |
38 | + def test_call_true(self): | |
39 | + self.assertTrue(FilterDescription()("program")) | |
40 | + #self.assertTrue(FilterDescription()("Zprogram")) | |
41 | + | |
42 | + def test_call_false(self): | |
43 | + self.assertFalse(FilterDescription()("XTprogram")) | |
34 | 44 | |
35 | 45 | class UserTests(unittest2.TestCase): |
36 | 46 | @classmethod |
37 | 47 | def setUpClass(self): |
38 | 48 | cfg = Config() |
39 | - #self.axi = xapian.Database(cfg.axi) | |
49 | + self.axi = xapian.Database(cfg.axi) | |
50 | + sample_packages = ["gimp","aaphoto","eog","emacs","dia","ferret", | |
51 | + "festival","file","inkscape","xpdf"] | |
52 | + self.sample_axi = SampleAptXapianIndex(sample_packages,self.axi) | |
40 | 53 | self.user = User({"gimp":1,"aaphoto":1,"eog":1,"emacs":1}) |
41 | - self.pxi = PkgXapianIndex("package-xapian-index") | |
54 | + #self.sample_axi._print() | |
42 | 55 | |
43 | 56 | def test_hash(self): |
44 | 57 | new_user = User(dict()) |
... | ... | @@ -100,34 +113,34 @@ class UserTests(unittest2.TestCase): |
100 | 113 | self.assertEqual(self.user.demographic_profile,desktop_art_admin) |
101 | 114 | |
102 | 115 | def test_items(self): |
103 | - self.assertEqual(self.user.items(),set(["gimp","aaphoto","eog","emacs"])) | |
104 | - | |
105 | - def test_axi_tag_profile(self): | |
106 | - package_terms = ["XP"+package for package in self.user.items()] | |
107 | - enquire = xapian.Enquire(self.pxi) | |
108 | - enquire.set_query(xapian.Query(xapian.Query.OP_OR,package_terms)) | |
109 | - user_packages = enquire.get_mset(0, self.pxi.get_doccount(), None, None) | |
110 | - tag_terms = [] | |
111 | - for p in user_packages: | |
112 | - tag_terms = tag_terms + [x.term for x in p.document.termlist() \ | |
113 | - if x.term.startswith("XT")] | |
114 | - relevant_count = dict([(tag,tag_terms.count(tag)) \ | |
115 | - for tag in set(tag_terms)]) | |
116 | - #rank = {} | |
117 | - #non_relevant_count = dict() | |
118 | - #for tag,count in relevant_count.items(): | |
119 | - # non_relevant_count[tag] = self.pxi.get_termfreq(tag)-count | |
120 | - # if non_relevant_count[tag]>0: | |
121 | - # rank[tag] = relevant_count[tag]/float(non_relevant_count[tag]) | |
122 | - #print "relevant",relevant_count | |
123 | - #print "non_relevant",non_relevant_count | |
124 | - #print sorted(rank.items(), key=operator.itemgetter(1)) | |
125 | - #[FIXME] get ths value based on real ranking | |
126 | - #print set(self.user.axi_tag_profile(self.pxi,4)) | |
127 | - self.assertEqual(set(self.user.axi_tag_profile(self.pxi,4)), | |
128 | - set(["XTuse::editing", "XTworks-with::image", | |
129 | - "XTworks-with-format::png", | |
130 | - "XTworks-with-format::jpg"])) | |
116 | + self.assertEqual(set(self.user.items()), | |
117 | + set(["gimp","aaphoto","eog","emacs"])) | |
118 | + | |
119 | + def test_profile(self): | |
120 | + self.assertEqual(self.user.profile(self.sample_axi,"tag",10), | |
121 | + self.user.tag_profile(self.sample_axi,10)) | |
122 | + self.assertEqual(self.user.profile(self.sample_axi,"desc",10), | |
123 | + self.user.desc_profile(self.sample_axi,10)) | |
124 | + self.assertEqual(self.user.profile(self.sample_axi,"full",10), | |
125 | + self.user.full_profile(self.sample_axi,10)) | |
126 | + | |
127 | + def test_tag_profile(self): | |
128 | + self.assertEqual(self.user.tag_profile(self.sample_axi,10), | |
129 | + ['XTuse::editing', 'XTworks-with::image:raster', | |
130 | + 'XTworks-with-format::png', 'XTworks-with-format::jpg', | |
131 | + 'XTworks-with::image','XTimplemented-in::c', | |
132 | + 'XTsuite::gnome', 'XTsuite::emacs', | |
133 | + 'XTrole::metapackage', 'XTdevel::editor']) | |
134 | + | |
135 | + def test_desc_profile(self): | |
136 | + self.assertEqual(self.user.desc_profile(self.sample_axi,10), | |
137 | + ['image', 'the', 'which', 'manipulation', 'program', | |
138 | + 'input', 'a', 'gnu', 'images', 'this']) | |
139 | + | |
140 | + def test_full_profile(self): | |
141 | + self.assertEqual(self.user.full_profile(self.sample_axi,10), | |
142 | + (self.user.tag_profile(self.sample_axi,5)+ | |
143 | + self.user.desc_profile(self.sample_axi,5))) | |
131 | 144 | |
132 | 145 | def test_maximal_pkg_profile(self): |
133 | 146 | old_pkg_profile = self.user.items() | ... | ... |
src/user.py
... | ... | @@ -25,6 +25,7 @@ import xapian |
25 | 25 | import logging |
26 | 26 | import apt |
27 | 27 | from singleton import Singleton |
28 | +import data | |
28 | 29 | |
29 | 30 | class FilterTag(xapian.ExpandDecider): |
30 | 31 | """ |
... | ... | @@ -34,7 +35,17 @@ class FilterTag(xapian.ExpandDecider): |
34 | 35 | """ |
35 | 36 | Return true if the term is a tag, else false. |
36 | 37 | """ |
37 | - return term[:2] == "XT" | |
38 | + return term.startswith("XT") | |
39 | + | |
40 | +class FilterDescription(xapian.ExpandDecider): | |
41 | + """ | |
42 | + Extend xapian.ExpandDecider to consider only package description terms. | |
43 | + """ | |
44 | + def __call__(self, term): | |
45 | + """ | |
46 | + Return true if the term is a package description term, else false. | |
47 | + """ | |
48 | + return term.islower() #or term.startswith("Z") | |
38 | 49 | |
39 | 50 | class DemographicProfile(Singleton): |
40 | 51 | def __init__(self): |
... | ... | @@ -63,57 +74,83 @@ class User: |
63 | 74 | """ |
64 | 75 | Define a user of a recommender. |
65 | 76 | """ |
66 | - def __init__(self,item_score,user_id=0,profiles_set=0): | |
77 | + def __init__(self,item_score,user_id=0,demo_profiles_set=0): | |
67 | 78 | """ |
68 | - Set initial user attributes. If no user_id was passed as parameter, a | |
69 | - random md5-hash is generated for that purpose. If the demographic | |
70 | - profile was not defined, it defaults to 'desktop' | |
79 | + Set initial user attributes. pkg_profile gets the whole set of items, | |
80 | + a random user_id is set if none was provided and the demographic | |
81 | + profile defaults to 'desktop'. | |
71 | 82 | """ |
72 | 83 | self.item_score = item_score |
84 | + self.pkg_profile = self.items() | |
85 | + | |
73 | 86 | if user_id: |
74 | 87 | self.id = user_id |
75 | 88 | else: |
76 | 89 | random.seed() |
77 | 90 | self.id = random.getrandbits(128) |
78 | - self.pkg_profile = self.item_score.keys() | |
79 | - if not profiles_set: | |
91 | + | |
92 | + if not demo_profiles_set: | |
80 | 93 | profiles_set = set(["desktop"]) |
81 | 94 | self.set_demographic_profile(profiles_set) |
82 | 95 | |
96 | + def items(self): | |
97 | + """ | |
98 | + Return the user items (the keys of item_score). | |
99 | + """ | |
100 | + return self.item_score.keys() | |
101 | + | |
83 | 102 | def set_demographic_profile(self,profiles_set): |
103 | + """ | |
104 | + Set demographic profile based on labels in 'profiles_set'. | |
105 | + """ | |
84 | 106 | self.demographic_profile = DemographicProfile()(profiles_set) |
85 | 107 | |
86 | - def items(self): | |
108 | + def profile(self,items_repository,content,size): | |
87 | 109 | """ |
88 | - Return the set of user items. | |
110 | + Get user profile for a specific type of content: packages tags, | |
111 | + description or both (full_profile) | |
112 | + """ | |
113 | + if content == "tag": return self.tag_profile(items_repository,size) | |
114 | + if content == "desc": return self.desc_profile(items_repository,size) | |
115 | + if content == "full": return self.full_profile(items_repository,size) | |
116 | + | |
117 | + def tag_profile(self,items_repository,size): | |
118 | + """ | |
119 | + Return most relevant tags for a list of packages. | |
89 | 120 | """ |
90 | - return set(self.item_score.keys()) | |
91 | - | |
92 | - def axi_tag_profile(self,apt_xapian_index,profile_size): | |
93 | - """ | |
94 | - Return most relevant tags for a list of packages based on axi. | |
95 | - """ | |
96 | - terms = ["XP"+item for item in self.pkg_profile] | |
97 | - query = xapian.Query(xapian.Query.OP_OR, terms) | |
98 | - enquire = xapian.Enquire(apt_xapian_index) | |
99 | - enquire.set_query(query) | |
100 | - rset = xapian.RSet() | |
101 | - for m in enquire.get_mset(0,apt_xapian_index.get_doccount()): | |
102 | - rset.add_document(m.docid) | |
103 | - # statistically good differentiators between relevant and non-relevant | |
104 | - eset = enquire.get_eset(profile_size, rset, FilterTag()) | |
105 | - profile = [] | |
106 | - for res in eset: | |
107 | - profile.append(res.term) | |
108 | - logging.debug("%.2f %s" % (res.weight,res.term.lstrip("XT"))) | |
121 | + enquire = xapian.Enquire(items_repository) | |
122 | + matches = data.axi_search_pkgs(items_repository,self.pkg_profile) | |
123 | + rset_packages = xapian.RSet() | |
124 | + for m in matches: | |
125 | + rset_packages.add_document(m.docid) | |
126 | + # statistically good differentiators | |
127 | + eset_tags = enquire.get_eset(size, rset_packages, FilterTag()) | |
128 | + profile = [res.term for res in eset_tags] | |
109 | 129 | return profile |
110 | 130 | |
111 | - #def txi_tag_profile(self,tags_xapian_index,profile_size): | |
112 | - # """ | |
113 | - # Return most relevant tags for a list of packages based on tags index. | |
114 | - # """ | |
115 | - # return tags_xapian_index.relevant_tags_from_db(self.pkg_profile, | |
116 | - # profile_size) | |
131 | + def desc_profile(self,items_repository,size): | |
132 | + """ | |
133 | + Return most relevant keywords for a list of packages based on their | |
134 | + text descriptions. | |
135 | + """ | |
136 | + enquire = xapian.Enquire(items_repository) | |
137 | + matches = data.axi_search_pkgs(items_repository,self.pkg_profile) | |
138 | + rset_packages = xapian.RSet() | |
139 | + for m in matches: | |
140 | + rset_packages.add_document(m.docid) | |
141 | + eset_keywords = enquire.get_eset(size, rset_packages, | |
142 | + FilterDescription()) | |
143 | + profile = [res.term for res in eset_keywords] | |
144 | + return profile | |
145 | + | |
146 | + def full_profile(self,items_repository,size): | |
147 | + """ | |
148 | + Return most relevant tags and keywords for a list of packages based | |
149 | + on their tags and descriptions. | |
150 | + """ | |
151 | + tag_profile = self.tag_profile(items_repository,size)[:size/2] | |
152 | + desc_profile = self.desc_profile(items_repository,size)[:size/2] | |
153 | + return tag_profile+desc_profile | |
117 | 154 | |
118 | 155 | def maximal_pkg_profile(self): |
119 | 156 | """ |
... | ... | @@ -132,12 +169,11 @@ class User: |
132 | 169 | if or_dep.name in self.pkg_profile: |
133 | 170 | self.pkg_profile.remove(or_dep.name) |
134 | 171 | except: |
135 | - logging.debug("Disconsidering package not found in cache: %s" | |
136 | - % p) | |
172 | + logging.debug("Package not found in cache: %s" % p) | |
137 | 173 | profile_size = len(self.pkg_profile) |
138 | - logging.info("Reduced packages profile size from %d to %d." % | |
139 | - (old_profile_size, profile_size)) | |
140 | - return set(self.pkg_profile) | |
174 | + logging.debug("Maximal package profile: reduced packages profile size \ | |
175 | + from %d to %d." % (old_profile_size, profile_size)) | |
176 | + return self.pkg_profile | |
141 | 177 | |
142 | 178 | class LocalSystem(User): |
143 | 179 | """ |
... | ... | @@ -168,8 +204,9 @@ class LocalSystem(User): |
168 | 204 | if pkg.is_auto_installed: |
169 | 205 | self.pkg_profile.remove(p) |
170 | 206 | except: |
171 | - logging.debug("Disconsidering package not found in cache: %s" | |
172 | - % p) | |
207 | + logging.debug("Package not found in cache: %s" % p) | |
173 | 208 | profile_size = len(self.pkg_profile) |
174 | - logging.info("Reduced packages profile size from %d to %d." % | |
175 | - (old_profile_size, profile_size)) | |
209 | + logging.debug("No auto-intalled package profile: reduced packages \ | |
210 | + profile size from %d to %d." % | |
211 | + (old_profile_size, profile_size)) | |
212 | + return self.pkg_profile | ... | ... |