Commit e6bf05b1c28a63af33232ce0457e665f04c831d0

Authored by Tássia Camões Araújo
1 parent 0b42f57e
Exists in master and in 1 other branch add_vagrant

Collaborative strategy implementation.

src/config.py
... ... @@ -41,6 +41,8 @@ class Config():
41 41 self.tags_index = "~/.app-recommender/debtags_index"
42 42 self.axi = "/var/lib/apt-xapian-index/index"
43 43 self.axi_values = "/var/lib/apt-xapian-index/values"
  44 + self.popcon_index = "~/.app-recommender/popcon_index"
  45 + self.popcon_dir = "~/.app-recommender/popcon_dir"
44 46 self.strategy = "ct" # defaults to the cheapest one
45 47 self.reindex = 0
46 48 self.load_options()
... ... @@ -62,6 +64,8 @@ class Config():
62 64 print " -i, --tagsindex=PATH Path to debtags dedicated index."
63 65 print " -r, --force-reindex Force reindexing debtags database."
64 66 print " -a, --axi=PATH Path to Apt-xapian-index."
  67 + print " -p, --popconindex=PATH Path to popcon dedicated index."
  68 + print " -m, --popcondir=PATH Path to popcon submissions dir."
65 69 print " -s, --strategy=OPTION Recommendation strategy."
66 70 print ""
67 71 print " [ strategy options ] "
... ... @@ -104,10 +108,13 @@ class Config():
104 108 self.tags_index = self.read_option('recommender', 'tags_index')
105 109 self.reindex = self.read_option('recommender', 'reindex')
106 110 self.axi = self.read_option('recommender', 'axi')
  111 + self.popcon_index = self.read_option('recommender', 'popcon_index')
  112 + self.popcon_dir = self.read_option('recommender', 'popcon_dir')
107 113  
108   - short_options = "hdvo:c:t:i:ra:s:"
  114 + short_options = "hdvo:c:t:i:ra:p:m:s:"
109 115 long_options = ["help", "debug", "verbose", "output=", "config=",
110   - "tagsdb=", "tagsindex=", "reindex", "axi=", "strategy="]
  116 + "tagsdb=", "tagsindex=", "reindex", "axi=",
  117 + "popconindex=", "popcondir=", "strategy="]
111 118 try:
112 119 opts, args = getopt.getopt(sys.argv[1:], short_options,
113 120 long_options)
... ... @@ -138,6 +145,10 @@ class Config():
138 145 elif o in ("-a", "--axi"):
139 146 self.axi = p + "/index"
140 147 self.axi_values = p + "/values"
  148 + elif o in ("-p", "--popconindex"):
  149 + self.popcon_index = p
  150 + elif o in ("-p", "--popcondir"):
  151 + self.popcon_dir = p
141 152 elif o in ("-s", "--strategy"):
142 153 self.strategy = p
143 154 else:
... ...
src/data.py
... ... @@ -19,6 +19,7 @@
19 19  
20 20 import os
21 21 import sys
  22 +import gc
22 23 import re
23 24 import xapian
24 25 import axi
... ... @@ -53,6 +54,21 @@ def normalize_tags(string):
53 54 """
54 55 return string.replace(':','_').replace('-','\'')
55 56  
def load_debtags_db(db_path):
    """
    Load debtags database from the source file.

    Tags matching 'special::*' or '*::TODO' are filtered out while reading.

    db_path -- path of the debtags database file to read.

    Return the populated debtags.DB instance.
    Raise Error if the file cannot be opened or read.
    """
    tag_filter = re.compile(r"^special::.+$|^.+::TODO$")
    try:
        # 'with' guarantees the file is closed even if read() fails.
        with open(db_path, "r") as db_file:
            debtags_db = debtags.DB()
            debtags_db.read(db_file, lambda x: not tag_filter.match(x))
        return debtags_db
    except Exception:
        # This is a module-level function: there is no 'self' here, the
        # path comes from the 'db_path' argument.
        logging.error("Could not load DebtagsDB from '%s'." % db_path)
        raise Error
  71 +
56 72 class TagsXapianIndex(xapian.WritableDatabase,Singleton):
57 73 """
58 74 Data source for tags info defined as a singleton xapian database.
... ... @@ -76,25 +92,25 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton):
76 92 db_file.close()
77 93 self.load_index(cfg.reindex)
78 94  
79   - def load_db(self):
80   - """
81   - Load debtags database from the source file.
82   - """
83   - tag_filter = re.compile(r"^special::.+$|^.+::TODO$")
84   - try:
85   - db_file = open(self.db_path, "r")
86   - self.debtags_db.read(db_file,lambda x: not tag_filter.match(x))
87   - db_file.close()
88   - except:
89   - logging.error("Could not load DebtagsDB from '%s'." % self.db_path)
90   - raise Error
  95 +# def load_db(self):
  96 +# """
  97 +# Load debtags database from the source file.
  98 +# """
  99 +# tag_filter = re.compile(r"^special::.+$|^.+::TODO$")
  100 +# try:
  101 +# db_file = open(self.db_path, "r")
  102 +# self.debtags_db.read(db_file,lambda x: not tag_filter.match(x))
  103 +# db_file.close()
  104 +# except:
  105 +# logging.error("Could not load DebtagsDB from '%s'." % self.db_path)
  106 +# raise Error
91 107  
92 108 def relevant_tags_from_db(self,pkgs_list,qtd_of_tags):
93 109 """
94 110 Return most relevant tags considering a list of packages.
95 111 """
96 112 if not self.debtags_db.package_count():
97   - self.load_db()
  113 + self.debtags_db = load_debtags_db(self.db_path)
98 114 relevant_db = self.debtags_db.choose_packages(pkgs_list)
99 115 relevance_index = debtags.relevance_index_function(self.debtags_db,
100 116 relevant_db)
... ... @@ -117,7 +133,7 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton):
117 133 logging.info("Index must be updated.")
118 134 reindex = 1
119 135 except xapian.DatabaseError:
120   - logging.info("Could not open index.")
  136 + logging.info("Could not open debtags index.")
121 137 reindex =1
122 138  
123 139 if reindex:
... ... @@ -126,13 +142,15 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton):
126 142 def new_index(self):
127 143 """
128 144 Create a xapian index for debtags info based on 'debtags_db' and
129   - place it at 'index_path'.
  145 + place it at 'self.path'.
130 146 """
131 147 if not os.path.exists(self.path):
132 148 os.makedirs(self.path)
133 149  
134 150 try:
135   - logging.info("Creating new xapian index for debtags at \'%s\'" %
  151 + logging.info("Indexing debtags info from \'%s\'" %
  152 + self.db_path)
  153 + logging.info("Creating new xapian index at \'%s\'" %
136 154 self.path)
137 155 xapian.WritableDatabase.__init__(self,self.path,
138 156 xapian.DB_CREATE_OR_OVERWRITE)
... ... @@ -140,7 +158,7 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton):
140 158 logging.critical("Could not create xapian index.")
141 159 raise Error
142 160  
143   - self.load_db()
  161 + self.debtags_db = load_debtags_db(self.db_path)
144 162 self.set_metadata("md5",self.db_md5)
145 163  
146 164 for pkg,tags in self.debtags_db.iter_packages_tags():
... ... @@ -149,4 +167,94 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton):
149 167 for tag in tags:
150 168 doc.add_term(normalize_tags(tag))
151 169 doc_id = self.add_document(doc)
152   - logging.debug("Indexing doc %d",doc_id)
  170 + logging.debug("Debtags Xapian: Indexing doc %d",doc_id)
  171 +
class PopconXapianIndex(xapian.WritableDatabase,Singleton):
    """
    Data source for popcon submissions defined as a singleton xapian database.
    """
    def __init__(self,cfg):
        """
        Set initial attributes and load (or build) the index.

        cfg -- configuration object providing the 'popcon_index',
               'popcon_dir' and 'tags_db' paths; '~' is expanded in each.
        """
        self.path = os.path.expanduser(cfg.popcon_index)
        self.popcon_dir = os.path.expanduser(cfg.popcon_dir)
        self.debtags_path = os.path.expanduser(cfg.tags_db)
        self.load_index()

    def parse_submission(self,submission_path,binary=1):
        """
        Parse a popcon submission, generating (package, weight) pairs for
        the valid packages in the vote.

        submission_path -- path of the popcon submission file.
        binary -- when true, every package gets weight 1; otherwise the
                  weight depends on the usage marker found in the line.

        Lines are split on single spaces and the package name is taken
        from the third field; lines with three fields or fewer are ignored.
        """
        # The file is closed when the generator is exhausted or discarded.
        with open(submission_path) as submission:
            for line in submission:
                # Skip the header and footer lines of the submission.
                if line.startswith(("POPULARITY", "END-POPULARITY")):
                    continue
                data = line.rstrip("\n").split(" ")
                if len(data) <= 3:
                    continue
                pkg = data[2]
                if binary:
                    # every installed package has the same weight
                    yield pkg, 1
                elif data[3] == '<NOFILES>':
                    # No executable files to track
                    yield pkg, 1
                elif len(data) == 4:
                    # Recently used packages
                    yield pkg, 10
                elif data[4] == '<OLD>':
                    # Unused packages
                    yield pkg, 3
                elif data[4] == '<RECENT-CTIME>':
                    # Recently installed packages
                    yield pkg, 8

    def load_index(self):
        """
        Load an existing popcon index from 'self.path'; build a new one
        if it cannot be opened.
        """
        try:
            logging.info("Opening existing popcon xapian index at \'%s\'"
                         % self.path)
            xapian.Database.__init__(self,self.path)
        except xapian.DatabaseError:
            logging.info("Could not open popcon index.")
            self.new_index()

    def new_index(self):
        """
        Create a xapian index for popcon submissions at 'popcon_dir' and
        place it at 'self.path'.

        Each submission file becomes one document whose data is the file
        name and whose terms are the package names plus the "XT"-prefixed
        debtags of each package, weighted by parse_submission().

        Raise Error if the xapian index cannot be created.
        """
        if not os.path.exists(self.path):
            os.makedirs(self.path)
        debtags_db = load_debtags_db(self.debtags_path)

        try:
            logging.info("Indexing popcon submissions from \'%s\'" %
                         self.popcon_dir)
            logging.info("Creating new xapian index at \'%s\'" %
                         self.path)
            xapian.WritableDatabase.__init__(self,self.path,
                                             xapian.DB_CREATE_OR_OVERWRITE)
        except xapian.DatabaseError:
            logging.critical("Could not create popcon xapian index.")
            raise Error

        for root, dirs, files in os.walk(self.popcon_dir):
            for submission in files:
                submission_path = os.path.join(root, submission)
                doc = xapian.Document()
                doc.set_data(submission)
                logging.debug("Parsing popcon submission at \'%s\'" %
                              submission_path)
                for pkg, freq in self.parse_submission(submission_path):
                    doc.add_term(pkg,freq)
                    for tag in debtags_db.tags_of_package(pkg):
                        doc.add_term("XT"+tag,freq)
                doc_id = self.add_document(doc)
                logging.debug("Popcon Xapian: Indexing doc %d" % doc_id)
            # NOTE(review): the nesting level of the two calls below could
            # not be recovered from the flattened source; assumed to be one
            # collection per directory and a single flush at the end.
            # TODO confirm against the original file.
            # python garbage collector
            gc.collect()
        # flush to disk database changes
        self.flush()
... ...
src/recommender.py
... ... @@ -83,6 +83,14 @@ class Recommender:
83 83 self.items_repository = xapian.Database(cfg.axi)
84 84 self.strategy = AxiContentBasedStrategy()
85 85  
    def col(self,cfg):
        """
        Set recommender attributes to perform collaborative recommendation
        using popcon-xapian-index as source data.

        cfg -- configuration object handed to PopconXapianIndex.
        """
        # Constructing the index may open or build the popcon xapian
        # database, so this runs before the strategy is swapped.
        self.users_repository = PopconXapianIndex(cfg)
        # NOTE(review): assumes CollaborativeStrategy is in scope in this
        # module (it is defined in strategy.py) -- confirm the import.
        self.strategy = CollaborativeStrategy()
  93 +
86 94 def set_strategy(self,strategy):
87 95 """
88 96 Set the recommendation strategy.
... ...
src/strategy.py
... ... @@ -48,7 +48,6 @@ class PopularityHeuristic(ReputationHeuristic):
48 48 """
49 49 pass
50 50  
51   -
52 51 class PkgMatchDecider(xapian.MatchDecider):
53 52 """
54 53 Extend xapian.MatchDecider to not consider installed packages.
... ... @@ -67,6 +66,64 @@ class PkgMatchDecider(xapian.MatchDecider):
67 66 """
68 67 return doc.get_data() not in self.installed_pkgs
69 68  
class UserMatchDecider(xapian.MatchDecider):
    """
    Extend xapian.MatchDecider to match similar profiles.
    """

    def __init__(self, profile):
        """
        Set initial parameters.

        profile -- collection of package names describing the reference
                   user; kept as given in 'self.profile'.
        """
        xapian.MatchDecider.__init__(self)
        self.profile = profile
        logging.debug("UserMatchDecider profile: %s", profile)

    def __call__(self, doc):
        """
        True if the user has at least half of the packages from profile.

        doc -- candidate document; its terms are compared to the profile.
        """
        profile_size = len(self.profile)
        # Build the set per call so later changes to self.profile are
        # honoured while membership tests stay O(1).
        wanted = set(self.profile)
        pkg_match = sum(1 for term in doc if term.term in wanted)
        logging.debug("UserMatchDecider: doc %s matched %d of %d packages",
                      doc.get_docid(), pkg_match, profile_size)
        # Compare doubled counts instead of dividing: integer division
        # would round the threshold down (1/2 == 0), letting documents
        # with no package in common pass for small profiles.
        return 2 * pkg_match >= profile_size
  93 +
class PkgExpandDecider(xapian.ExpandDecider):
    """
    Expand decider that keeps package terms and discards tag terms.
    """

    def __init__(self):
        """
        Initialize the underlying xapian.ExpandDecider.
        """
        xapian.ExpandDecider.__init__(self)

    def __call__(self, term):
        """
        Return True unless the term carries the "XT" prefix.
        """
        is_tag = term.startswith("XT")
        return not is_tag
  110 +
class TagExpandDecider(xapian.ExpandDecider):
    """
    Extend xapian.ExpandDecider to consider tags only.
    """

    def __init__(self, profile=None):
        """
        Call base class init.

        profile -- unused; still accepted (now optional) so that existing
                   callers passing it keep working.
        """
        xapian.ExpandDecider.__init__(self)

    def __call__(self, term):
        """
        True if the term is a tag, i.e. it carries the "XT" prefix.
        """
        return term.startswith("XT")
70 127  
71 128 class RecommendationStrategy:
72 129 """
... ... @@ -82,7 +139,8 @@ class ItemReputationStrategy(RecommendationStrategy):
82 139 """
83 140 Perform recommendation strategy.
84 141 """
85   - return RecomendationResult()
  142 + logging.critical("Item reputation recommendation strategy is not yet implemented.")
  143 + raise Error
86 144  
87 145 class ContentBasedStrategy(RecommendationStrategy):
88 146 """
... ... @@ -133,15 +191,41 @@ class AxiContentBasedStrategy(RecommendationStrategy):
133 191 item_score[m.document.get_data()] = m.rank
134 192 return recommender.RecommendationResult(item_score,20)
135 193  
136   -class ColaborativeStrategy(RecommendationStrategy):
class CollaborativeStrategy(RecommendationStrategy):
    """
    Collaborative recommendation strategy.
    """
    def run(self,rec,user):
        """
        Perform recommendation strategy.

        rec -- recommender whose 'users_repository' is queried.
        user -- user providing the package profile used as the query.

        The 20 best matching documents are marked as relevant, and the
        terms suggested from them (restricted by PkgExpandDecider) are
        scored by their position in the expansion set.

        Raise Error if the repository query fails.
        """
        profile = user.maximal_pkg_profile()
        enquire = xapian.Enquire(rec.users_repository)
        enquire.set_query(xapian.Query(xapian.Query.OP_OR,profile))

        try:
            mset = enquire.get_mset(0, 20)
        except xapian.DatabaseError as error:
            logging.critical(error.get_msg())
            raise Error

        rset = xapian.RSet()
        for match in mset:
            rset.add_document(match.document.get_docid())
            logging.debug("Counting as relevant submission %s" %
                          match.document.get_data())

        eset = enquire.get_eset(20,rset,PkgExpandDecider())
        # Score each suggested term with its 0-based position in the eset.
        item_score = dict((item.term, position)
                          for position, item in enumerate(eset))

        return recommender.RecommendationResult(item_score,20)
145 229  
146 230 class KnowledgeBasedStrategy(RecommendationStrategy):
147 231 """
... ... @@ -151,7 +235,8 @@ class KnowledgeBasedStrategy(RecommendationStrategy):
151 235 """
152 236 Perform recommendation strategy.
153 237 """
154   - return RecomendationResult()
  238 + logging.critical("Knowledge-based recommendation strategy is not yet implemented.")
  239 + raise Error
155 240  
156 241 class DemographicStrategy(RecommendationStrategy):
157 242 """
... ... @@ -161,4 +246,5 @@ class DemographicStrategy(RecommendationStrategy):
161 246 """
162 247 Perform recommendation strategy.
163 248 """
164   - return RecomendationResult()
  249 + logging.critical("Demographic recommendation strategy is not yet implemented.")
  250 + raise Error
... ...
src/user.py
... ... @@ -95,6 +95,7 @@ class User:
95 95 profile_size = len(self.pkg_profile)
96 96 logging.info("Reduced packages profile size from %d to %d." %
97 97 (old_profile_size, profile_size))
  98 + return self.pkg_profile
98 99  
99 100 class LocalSystem(User):
100 101 """
... ...