Commit e6bf05b1c28a63af33232ce0457e665f04c831d0

Authored by Tássia Camões Araújo
1 parent 0b42f57e
Exists in master and in 1 other branch add_vagrant

Collaborative strategy implementation.

@@ -41,6 +41,8 @@ class Config(): @@ -41,6 +41,8 @@ class Config():
41 self.tags_index = "~/.app-recommender/debtags_index" 41 self.tags_index = "~/.app-recommender/debtags_index"
42 self.axi = "/var/lib/apt-xapian-index/index" 42 self.axi = "/var/lib/apt-xapian-index/index"
43 self.axi_values = "/var/lib/apt-xapian-index/values" 43 self.axi_values = "/var/lib/apt-xapian-index/values"
  44 + self.popcon_index = "~/.app-recommender/popcon_index"
  45 + self.popcon_dir = "~/.app-recommender/popcon_dir"
44 self.strategy = "ct" # defaults to the cheapest one 46 self.strategy = "ct" # defaults to the cheapest one
45 self.reindex = 0 47 self.reindex = 0
46 self.load_options() 48 self.load_options()
@@ -62,6 +64,8 @@ class Config(): @@ -62,6 +64,8 @@ class Config():
62 print " -i, --tagsindex=PATH Path to debtags dedicated index." 64 print " -i, --tagsindex=PATH Path to debtags dedicated index."
63 print " -r, --force-reindex Force reindexing debtags database." 65 print " -r, --force-reindex Force reindexing debtags database."
64 print " -a, --axi=PATH Path to Apt-xapian-index." 66 print " -a, --axi=PATH Path to Apt-xapian-index."
  67 + print " -p, --popconindex=PATH Path to popcon dedicated index."
  68 + print " -m, --popcondir=PATH Path to popcon submissions dir."
65 print " -s, --strategy=OPTION Recommendation strategy." 69 print " -s, --strategy=OPTION Recommendation strategy."
66 print "" 70 print ""
67 print " [ strategy options ] " 71 print " [ strategy options ] "
@@ -104,10 +108,13 @@ class Config(): @@ -104,10 +108,13 @@ class Config():
104 self.tags_index = self.read_option('recommender', 'tags_index') 108 self.tags_index = self.read_option('recommender', 'tags_index')
105 self.reindex = self.read_option('recommender', 'reindex') 109 self.reindex = self.read_option('recommender', 'reindex')
106 self.axi = self.read_option('recommender', 'axi') 110 self.axi = self.read_option('recommender', 'axi')
  111 + self.popcon_index = self.read_option('recommender', 'popcon_index')
  112 + self.popcon_dir = self.read_option('recommender', 'popcon_dir')
107 113
108 - short_options = "hdvo:c:t:i:ra:s:" 114 + short_options = "hdvo:c:t:i:ra:p:m:s:"
109 long_options = ["help", "debug", "verbose", "output=", "config=", 115 long_options = ["help", "debug", "verbose", "output=", "config=",
110 - "tagsdb=", "tagsindex=", "reindex", "axi=", "strategy="] 116 + "tagsdb=", "tagsindex=", "reindex", "axi=",
  117 + "popconindex=", "popcondir=", "strategy="]
111 try: 118 try:
112 opts, args = getopt.getopt(sys.argv[1:], short_options, 119 opts, args = getopt.getopt(sys.argv[1:], short_options,
113 long_options) 120 long_options)
@@ -138,6 +145,10 @@ class Config(): @@ -138,6 +145,10 @@ class Config():
138 elif o in ("-a", "--axi"): 145 elif o in ("-a", "--axi"):
139 self.axi = p + "/index" 146 self.axi = p + "/index"
140 self.axi_values = p + "/values" 147 self.axi_values = p + "/values"
  148 + elif o in ("-p", "--popconindex"):
  149 + self.popcon_index = p
  150 + elif o in ("-p", "--popcondir"):
  151 + self.popcon_dir = p
141 elif o in ("-s", "--strategy"): 152 elif o in ("-s", "--strategy"):
142 self.strategy = p 153 self.strategy = p
143 else: 154 else:
@@ -19,6 +19,7 @@ @@ -19,6 +19,7 @@
19 19
20 import os 20 import os
21 import sys 21 import sys
  22 +import gc
22 import re 23 import re
23 import xapian 24 import xapian
24 import axi 25 import axi
@@ -53,6 +54,21 @@ def normalize_tags(string): @@ -53,6 +54,21 @@ def normalize_tags(string):
53 """ 54 """
54 return string.replace(':','_').replace('-','\'') 55 return string.replace(':','_').replace('-','\'')
55 56
def load_debtags_db(db_path):
    """
    Load debtags database from the source file.

    Open 'db_path', read it into a new debtags.DB and return that DB.
    The reader is given a predicate that rejects tags matching
    'special::*' or '*::TODO'.

    Log an error and raise Error if the file cannot be opened or read.
    """
    tag_filter = re.compile(r"^special::.+$|^.+::TODO$")
    try:
        debtags_db = debtags.DB()
        # 'with' guarantees the file is closed even when read() fails.
        with open(db_path, "r") as db_file:
            debtags_db.read(db_file, lambda x: not tag_filter.match(x))
        return debtags_db
    except Exception:
        # This is a module-level function: there is no 'self' here, the
        # path to report is the 'db_path' argument.
        logging.error("Could not load DebtagsDB from '%s'." % db_path)
        raise Error
  71 +
56 class TagsXapianIndex(xapian.WritableDatabase,Singleton): 72 class TagsXapianIndex(xapian.WritableDatabase,Singleton):
57 """ 73 """
58 Data source for tags info defined as a singleton xapian database. 74 Data source for tags info defined as a singleton xapian database.
@@ -76,25 +92,25 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton): @@ -76,25 +92,25 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton):
76 db_file.close() 92 db_file.close()
77 self.load_index(cfg.reindex) 93 self.load_index(cfg.reindex)
78 94
79 - def load_db(self):  
80 - """  
81 - Load debtags database from the source file.  
82 - """  
83 - tag_filter = re.compile(r"^special::.+$|^.+::TODO$")  
84 - try:  
85 - db_file = open(self.db_path, "r")  
86 - self.debtags_db.read(db_file,lambda x: not tag_filter.match(x))  
87 - db_file.close()  
88 - except:  
89 - logging.error("Could not load DebtagsDB from '%s'." % self.db_path)  
90 - raise Error 95 +# def load_db(self):
  96 +# """
  97 +# Load debtags database from the source file.
  98 +# """
  99 +# tag_filter = re.compile(r"^special::.+$|^.+::TODO$")
  100 +# try:
  101 +# db_file = open(self.db_path, "r")
  102 +# self.debtags_db.read(db_file,lambda x: not tag_filter.match(x))
  103 +# db_file.close()
  104 +# except:
  105 +# logging.error("Could not load DebtagsDB from '%s'." % self.db_path)
  106 +# raise Error
91 107
92 def relevant_tags_from_db(self,pkgs_list,qtd_of_tags): 108 def relevant_tags_from_db(self,pkgs_list,qtd_of_tags):
93 """ 109 """
94 Return most relevant tags considering a list of packages. 110 Return most relevant tags considering a list of packages.
95 """ 111 """
96 if not self.debtags_db.package_count(): 112 if not self.debtags_db.package_count():
97 - self.load_db() 113 + self.debtags_db = load_debtags_db(self.db_path)
98 relevant_db = self.debtags_db.choose_packages(pkgs_list) 114 relevant_db = self.debtags_db.choose_packages(pkgs_list)
99 relevance_index = debtags.relevance_index_function(self.debtags_db, 115 relevance_index = debtags.relevance_index_function(self.debtags_db,
100 relevant_db) 116 relevant_db)
@@ -117,7 +133,7 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton): @@ -117,7 +133,7 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton):
117 logging.info("Index must be updated.") 133 logging.info("Index must be updated.")
118 reindex = 1 134 reindex = 1
119 except xapian.DatabaseError: 135 except xapian.DatabaseError:
120 - logging.info("Could not open index.") 136 + logging.info("Could not open debtags index.")
121 reindex =1 137 reindex =1
122 138
123 if reindex: 139 if reindex:
@@ -126,13 +142,15 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton): @@ -126,13 +142,15 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton):
126 def new_index(self): 142 def new_index(self):
127 """ 143 """
128 Create a xapian index for debtags info based on 'debtags_db' and 144 Create a xapian index for debtags info based on 'debtags_db' and
129 - place it at 'index_path'. 145 + place it at 'self.path'.
130 """ 146 """
131 if not os.path.exists(self.path): 147 if not os.path.exists(self.path):
132 os.makedirs(self.path) 148 os.makedirs(self.path)
133 149
134 try: 150 try:
135 - logging.info("Creating new xapian index for debtags at \'%s\'" % 151 + logging.info("Indexing debtags info from \'%s\'" %
  152 + self.db_path)
  153 + logging.info("Creating new xapian index at \'%s\'" %
136 self.path) 154 self.path)
137 xapian.WritableDatabase.__init__(self,self.path, 155 xapian.WritableDatabase.__init__(self,self.path,
138 xapian.DB_CREATE_OR_OVERWRITE) 156 xapian.DB_CREATE_OR_OVERWRITE)
@@ -140,7 +158,7 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton): @@ -140,7 +158,7 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton):
140 logging.critical("Could not create xapian index.") 158 logging.critical("Could not create xapian index.")
141 raise Error 159 raise Error
142 160
143 - self.load_db() 161 + self.debtags_db = load_debtags_db(self.db_path)
144 self.set_metadata("md5",self.db_md5) 162 self.set_metadata("md5",self.db_md5)
145 163
146 for pkg,tags in self.debtags_db.iter_packages_tags(): 164 for pkg,tags in self.debtags_db.iter_packages_tags():
@@ -149,4 +167,94 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton): @@ -149,4 +167,94 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton):
149 for tag in tags: 167 for tag in tags:
150 doc.add_term(normalize_tags(tag)) 168 doc.add_term(normalize_tags(tag))
151 doc_id = self.add_document(doc) 169 doc_id = self.add_document(doc)
152 - logging.debug("Indexing doc %d",doc_id) 170 + logging.debug("Debtags Xapian: Indexing doc %d",doc_id)
  171 +
class PopconXapianIndex(xapian.WritableDatabase,Singleton):
    """
    Data source for popcon submissions defined as a singleton xapian database.
    """
    def __init__(self,cfg):
        """
        Set initial attributes.

        Paths are taken from 'cfg.popcon_index', 'cfg.popcon_dir' and
        'cfg.tags_db' (with '~' expanded); the index is then loaded, or
        created when it cannot be opened.
        """
        self.path = os.path.expanduser(cfg.popcon_index)
        self.popcon_dir = os.path.expanduser(cfg.popcon_dir)
        self.debtags_path = os.path.expanduser(cfg.tags_db)
        self.load_index()

    def parse_submission(self,submission_path,binary=1):
        """
        Parse a popcon submission, generating the names of the valid packages
        in the vote.

        Yield (package, weight) pairs, where the package is the third
        space-separated field of each line having more than three fields.
        Lines starting with "POPULARITY" or "END-POPULARITY" are skipped.
        When 'binary' is true every package gets weight 1; otherwise the
        weight depends on the fourth/fifth fields as commented below, and
        lines with any other fifth field yield nothing.
        """
        # The context manager closes the file once the generator is
        # exhausted or discarded.
        with open(submission_path) as submission:
            for line in submission:
                if line.startswith(("POPULARITY", "END-POPULARITY")):
                    continue
                # Strip only the newline, so a final line lacking one does
                # not lose the last character of its last field.
                data = line.rstrip("\n").split(" ")
                if len(data) <= 3:
                    continue
                pkg = data[2]
                if binary:
                    # every installed package has the same weight
                    yield pkg, 1
                elif data[3] == '<NOFILES>':
                    # No executable files to track
                    yield pkg, 1
                elif len(data) == 4:
                    # Recently used packages
                    yield pkg, 10
                elif data[4] == '<OLD>':
                    # Unused packages
                    yield pkg, 3
                elif data[4] == '<RECENT-CTIME>':
                    # Recently installed packages
                    yield pkg, 8

    def load_index(self):
        """
        Load an existing popcon index, building a new one if it cannot be
        opened.
        """
        try:
            logging.info("Opening existing popcon xapian index at \'%s\'"
                         % self.path)
            xapian.Database.__init__(self,self.path)
        except xapian.DatabaseError:
            logging.info("Could not open popcon index.")
            self.new_index()

    def new_index(self):
        """
        Create a xapian index for popcon submissions at 'popcon_dir' and
        place it at 'self.path'.

        Every file found under 'popcon_dir' becomes one document whose data
        is the file name and whose terms are the submitted packages plus
        their debtags tags prefixed with "XT".

        Raise Error if the xapian database cannot be created.
        """
        if not os.path.exists(self.path):
            os.makedirs(self.path)
        debtags_db = load_debtags_db(self.debtags_path)

        try:
            logging.info("Indexing popcon submissions from \'%s\'" %
                         self.popcon_dir)
            logging.info("Creating new xapian index at \'%s\'" %
                         self.path)
            xapian.WritableDatabase.__init__(self,self.path,
                                             xapian.DB_CREATE_OR_OVERWRITE)
        except xapian.DatabaseError:
            logging.critical("Could not create popcon xapian index.")
            raise Error

        for root, dirs, files in os.walk(self.popcon_dir):
            for submission in files:
                submission_path = os.path.join(root, submission)
                doc = xapian.Document()
                doc.set_data(submission)
                logging.debug("Parsing popcon submission at \'%s\'" %
                              submission_path)
                for pkg, freq in self.parse_submission(submission_path):
                    doc.add_term(pkg,freq)
                    # Tags share the term space with packages; the "XT"
                    # prefix keeps them distinguishable.
                    for tag in debtags_db.tags_of_package(pkg):
                        doc.add_term("XT"+tag,freq)
                doc_id = self.add_document(doc)
                logging.debug("Popcon Xapian: Indexing doc %d" % doc_id)
            # python garbage collector
            gc.collect()
        # flush to disk database changes
        self.flush()
src/recommender.py
@@ -83,6 +83,14 @@ class Recommender: @@ -83,6 +83,14 @@ class Recommender:
83 self.items_repository = xapian.Database(cfg.axi) 83 self.items_repository = xapian.Database(cfg.axi)
84 self.strategy = AxiContentBasedStrategy() 84 self.strategy = AxiContentBasedStrategy()
85 85
  86 + def col(self,cfg):
  87 + """
  88 + Set recommender attributes to perform collaborative recommendation
  89 + using popcon-xapian-index as source data.
  90 + """
  91 + self.users_repository = PopconXapianIndex(cfg)
  92 + self.strategy = CollaborativeStrategy()
  93 +
86 def set_strategy(self,strategy): 94 def set_strategy(self,strategy):
87 """ 95 """
88 Set the recommendation strategy. 96 Set the recommendation strategy.
src/strategy.py
@@ -48,7 +48,6 @@ class PopularityHeuristic(ReputationHeuristic): @@ -48,7 +48,6 @@ class PopularityHeuristic(ReputationHeuristic):
48 """ 48 """
49 pass 49 pass
50 50
51 -  
52 class PkgMatchDecider(xapian.MatchDecider): 51 class PkgMatchDecider(xapian.MatchDecider):
53 """ 52 """
54 Extend xapian.MatchDecider to not consider installed packages. 53 Extend xapian.MatchDecider to not consider installed packages.
@@ -67,6 +66,64 @@ class PkgMatchDecider(xapian.MatchDecider): @@ -67,6 +66,64 @@ class PkgMatchDecider(xapian.MatchDecider):
67 """ 66 """
68 return doc.get_data() not in self.installed_pkgs 67 return doc.get_data() not in self.installed_pkgs
69 68
class UserMatchDecider(xapian.MatchDecider):
    """
    Extend xapian.MatchDecider to match similar profiles.
    """

    def __init__(self, profile):
        """
        Set initial parameters.

        'profile' is the sized collection of terms that each candidate
        document is compared against.
        """
        xapian.MatchDecider.__init__(self)
        self.profile = profile
        # Diagnostics go through logging rather than stdout.
        logging.debug("UserMatchDecider profile: %s", profile)

    def __call__(self, doc):
        """
        True if the number of terms of 'doc' that belong to the profile is
        at least half of the profile size (rounded down).
        """
        profile_size = len(self.profile)
        pkg_match = 0
        for term in doc:
            if term.term in self.profile:
                pkg_match += 1
        logging.debug("UserMatchDecider: doc id %s matched %d profile terms",
                      doc.get_docid(), pkg_match)
        # Explicit floor division keeps the threshold an integer whatever
        # division mode is in effect.
        return pkg_match >= profile_size // 2
  93 +
class PkgExpandDecider(xapian.ExpandDecider):
    """
    ExpandDecider that accepts package terms only.
    """

    def __init__(self):
        """
        Initialize the underlying xapian.ExpandDecider.
        """
        xapian.ExpandDecider.__init__(self)

    def __call__(self, term):
        """
        Return False for terms carrying the "XT" prefix and True for any
        other term.
        """
        if term.startswith("XT"):
            return False
        return True
  110 +
class TagExpandDecider(xapian.ExpandDecider):
    """
    Extend xapian.ExpandDecider to consider tags only.
    """

    def __init__(self, profile=None):
        """
        Call base class init.

        'profile' is accepted for backward compatibility only; it is not
        used.
        """
        xapian.ExpandDecider.__init__(self)

    def __call__(self, term):
        """
        True if the term is a tag, i.e. it carries the "XT" prefix.
        """
        # The parameter was previously named 'doc' while the body referenced
        # 'term', which raised NameError on every call.
        return term.startswith("XT")
70 127
71 class RecommendationStrategy: 128 class RecommendationStrategy:
72 """ 129 """
@@ -82,7 +139,8 @@ class ItemReputationStrategy(RecommendationStrategy): @@ -82,7 +139,8 @@ class ItemReputationStrategy(RecommendationStrategy):
82 """ 139 """
83 Perform recommendation strategy. 140 Perform recommendation strategy.
84 """ 141 """
85 - return RecomendationResult() 142 + logging.critical("Item reputation recommendation strategy is not yet implemented.")
  143 + raise Error
86 144
87 class ContentBasedStrategy(RecommendationStrategy): 145 class ContentBasedStrategy(RecommendationStrategy):
88 """ 146 """
@@ -133,15 +191,41 @@ class AxiContentBasedStrategy(RecommendationStrategy): @@ -133,15 +191,41 @@ class AxiContentBasedStrategy(RecommendationStrategy):
133 item_score[m.document.get_data()] = m.rank 191 item_score[m.document.get_data()] = m.rank
134 return recommender.RecommendationResult(item_score,20) 192 return recommender.RecommendationResult(item_score,20)
135 193
class CollaborativeStrategy(RecommendationStrategy):
    """
    Collaborative recommendation strategy.
    """
    def run(self,rec,user):
        """
        Perform recommendation strategy.

        Query 'rec.users_repository' with an OR of the terms returned by
        'user.maximal_pkg_profile()', mark the matching submissions as
        relevant and score the expansion terms accepted by PkgExpandDecider
        by their position in the expansion set.

        Raise Error if the repository query fails with a
        xapian.DatabaseError.
        """
        profile_query = xapian.Query(xapian.Query.OP_OR,
                                     user.maximal_pkg_profile())
        enquire = xapian.Enquire(rec.users_repository)
        enquire.set_query(profile_query)

        # The first 20 matches are taken as-is; no UserMatchDecider
        # filtering is applied.
        try:
            similar_users = enquire.get_mset(0, 20)
        except xapian.DatabaseError as error:
            logging.critical(error.get_msg())
            raise Error

        relevant = xapian.RSet()
        for match in similar_users:
            submission = match.document
            relevant.add_document(submission.get_docid())
            logging.debug("Counting as relevant submission %s" %
                          submission.get_data())

        expanded = enquire.get_eset(20,relevant,PkgExpandDecider())
        item_score = {}
        for position, suggestion in enumerate(expanded):
            item_score[suggestion.term] = position

        return recommender.RecommendationResult(item_score,20)
145 229
146 class KnowledgeBasedStrategy(RecommendationStrategy): 230 class KnowledgeBasedStrategy(RecommendationStrategy):
147 """ 231 """
@@ -151,7 +235,8 @@ class KnowledgeBasedStrategy(RecommendationStrategy): @@ -151,7 +235,8 @@ class KnowledgeBasedStrategy(RecommendationStrategy):
151 """ 235 """
152 Perform recommendation strategy. 236 Perform recommendation strategy.
153 """ 237 """
154 - return RecomendationResult() 238 + logging.critical("Knowledge-based recommendation strategy is not yet implemented.")
  239 + raise Error
155 240
156 class DemographicStrategy(RecommendationStrategy): 241 class DemographicStrategy(RecommendationStrategy):
157 """ 242 """
@@ -161,4 +246,5 @@ class DemographicStrategy(RecommendationStrategy): @@ -161,4 +246,5 @@ class DemographicStrategy(RecommendationStrategy):
161 """ 246 """
162 Perform recommendation strategy. 247 Perform recommendation strategy.
163 """ 248 """
164 - return RecomendationResult() 249 + logging.critical("Demographic recommendation strategy is not yet implemented.")
  250 + raise Error
@@ -95,6 +95,7 @@ class User: @@ -95,6 +95,7 @@ class User:
95 profile_size = len(self.pkg_profile) 95 profile_size = len(self.pkg_profile)
96 logging.info("Reduced packages profile size from %d to %d." % 96 logging.info("Reduced packages profile size from %d to %d." %
97 (old_profile_size, profile_size)) 97 (old_profile_size, profile_size))
  98 + return self.pkg_profile
98 99
99 class LocalSystem(User): 100 class LocalSystem(User):
100 """ 101 """