Commit 11dce5d56c8d60e16748c35ba01a9cd28b7504b2
1 parent
8859fa78
Exists in
master
and in
1 other branch
Code refactoring and amenities
- Merged DebtagsDB and DebtagsIndex into TagsXapianIndex
- Moved recommender setup to class initialization
- Handling errors with try, except and raise statements
- Saving and logging computation time
Showing 9 changed files with 98 additions and 77 deletions
Show diff stats
src/app_recommender.py
| @@ -20,6 +20,8 @@ | @@ -20,6 +20,8 @@ | ||
| 20 | import os | 20 | import os |
| 21 | import sys | 21 | import sys |
| 22 | import logging | 22 | import logging |
| 23 | +import datetime | ||
| 24 | +from datetime import timedelta | ||
| 23 | 25 | ||
| 24 | from config import * | 26 | from config import * |
| 25 | from data import * | 27 | from data import * |
| @@ -28,27 +30,24 @@ from similarity_measure import * | @@ -28,27 +30,24 @@ from similarity_measure import * | ||
| 28 | from recommender import * | 30 | from recommender import * |
| 29 | from strategy import * | 31 | from strategy import * |
| 30 | from user import * | 32 | from user import * |
| 33 | +from error import Error | ||
| 31 | 34 | ||
| 32 | -def set_up_recommender(cfg): | ||
| 33 | - if cfg.strategy == "cta": | ||
| 34 | - axi_db = xapian.Database(cfg.axi) | ||
| 35 | - app_rec = Recommender(axi_db) | ||
| 36 | - app_rec.set_strategy(AxiContentBasedStrategy()) | 35 | +if __name__ == '__main__': |
| 36 | + try: | ||
| 37 | + cfg = Config() | ||
| 38 | + rec = Recommender(cfg) | ||
| 39 | + user = LocalSystem() | ||
| 37 | 40 | ||
| 38 | - elif cfg.strategy == "ct": | ||
| 39 | - debtags_db = DebtagsDB(cfg.tags_db) | ||
| 40 | - if not debtags_db.load(): | ||
| 41 | - logging.error("Could not load DebtagsDB from %s." % cfg.tags_db) | ||
| 42 | - sys.exit(1) | ||
| 43 | - debtags_index = DebtagsIndex(os.path.expanduser(cfg.tags_index)) | ||
| 44 | - debtags_index.load(debtags_db,cfg.reindex) | ||
| 45 | - app_rec = Recommender(debtags_index) | ||
| 46 | - app_rec.set_strategy(ContentBasedStrategy()) | 41 | + begin_time = datetime.datetime.now() |
| 42 | + logging.debug("Recommendation computation started at %s" % begin_time) | ||
| 47 | 43 | ||
| 48 | - return app_rec | 44 | + print rec.get_recommendation(user) |
| 45 | + | ||
| 46 | + end_time = datetime.datetime.now() | ||
| 47 | + logging.debug("Recommendation computation completed at %s" % end_time) | ||
| 48 | + delta = end_time - begin_time | ||
| 49 | + logging.info("Time elapsed: %d seconds." % delta.seconds) | ||
| 50 | + | ||
| 51 | + except Error: | ||
| 52 | + logging.critical("Aborting proccess. Use '--debug' for more details.") | ||
| 49 | 53 | ||
| 50 | -if __name__ == '__main__': | ||
| 51 | - cfg = Config() | ||
| 52 | - rec = set_up_recommender(cfg) | ||
| 53 | - user = LocalSystem() | ||
| 54 | - print rec.get_recommendation(user) |
src/config.py
| @@ -50,7 +50,7 @@ class Config(): | @@ -50,7 +50,7 @@ class Config(): | ||
| 50 | """ | 50 | """ |
| 51 | Print usage help. | 51 | Print usage help. |
| 52 | """ | 52 | """ |
| 53 | - print " [ general ]" | 53 | + print "\n [ general ]" |
| 54 | print " -h, --help Print this help" | 54 | print " -h, --help Print this help" |
| 55 | print " -d, --debug Set logging level to debug." | 55 | print " -d, --debug Set logging level to debug." |
| 56 | print " -v, --verbose Set logging level to verbose." | 56 | print " -v, --verbose Set logging level to verbose." |
| @@ -130,9 +130,9 @@ class Config(): | @@ -130,9 +130,9 @@ class Config(): | ||
| 130 | elif o in ("-c", "--config"): | 130 | elif o in ("-c", "--config"): |
| 131 | self.config = p | 131 | self.config = p |
| 132 | elif o in ("-t", "--tagsdb"): | 132 | elif o in ("-t", "--tagsdb"): |
| 133 | - self.tagsdb = p | 133 | + self.tags_db = p |
| 134 | elif o in ("-i", "--tagsindex"): | 134 | elif o in ("-i", "--tagsindex"): |
| 135 | - self.tagsindex = p | 135 | + self.tags_index = p |
| 136 | elif o in ("-r", "--force-reindex"): | 136 | elif o in ("-r", "--force-reindex"): |
| 137 | self.reindex = 1 | 137 | self.reindex = 1 |
| 138 | elif o in ("-a", "--axi"): | 138 | elif o in ("-a", "--axi"): |
src/cross_validation.py
| @@ -39,7 +39,7 @@ def set_up_recommender(cfg): | @@ -39,7 +39,7 @@ def set_up_recommender(cfg): | ||
| 39 | debtags_db = DebtagsDB(cfg.tags_db) | 39 | debtags_db = DebtagsDB(cfg.tags_db) |
| 40 | if not debtags_db.load(): | 40 | if not debtags_db.load(): |
| 41 | logging.error("Could not load DebtagsDB from %s." % cfg.tags_db) | 41 | logging.error("Could not load DebtagsDB from %s." % cfg.tags_db) |
| 42 | - sys.exit(1) | 42 | + raise Error |
| 43 | debtags_index = DebtagsIndex(os.path.expanduser(cfg.tags_index)) | 43 | debtags_index = DebtagsIndex(os.path.expanduser(cfg.tags_index)) |
| 44 | debtags_index.load(debtags_db,cfg.reindex) | 44 | debtags_index.load(debtags_db,cfg.reindex) |
| 45 | app_rec = Recommender(debtags_index) | 45 | app_rec = Recommender(debtags_index) |
src/data.py
| @@ -26,6 +26,8 @@ from debian import debtags | @@ -26,6 +26,8 @@ from debian import debtags | ||
| 26 | import logging | 26 | import logging |
| 27 | import hashlib | 27 | import hashlib |
| 28 | 28 | ||
| 29 | +from error import Error | ||
| 30 | + | ||
| 29 | class Item: | 31 | class Item: |
| 30 | """ """ | 32 | """ """ |
| 31 | 33 | ||
| @@ -51,46 +53,46 @@ class Singleton(object): | @@ -51,46 +53,46 @@ class Singleton(object): | ||
| 51 | cls._inst = object.__new__(cls) | 53 | cls._inst = object.__new__(cls) |
| 52 | return cls._inst | 54 | return cls._inst |
| 53 | 55 | ||
| 54 | -class DebtagsDB(debtags.DB,Singleton): | ||
| 55 | - def __init__(self,path): | ||
| 56 | - self.path = path | 56 | +class TagsXapianIndex(xapian.WritableDatabase,Singleton): |
| 57 | + def __init__(self,cfg): | ||
| 58 | + self.path = os.path.expanduser(cfg.tags_index) | ||
| 59 | + self.db_path = os.path.expanduser(cfg.tags_db) | ||
| 60 | + self.debtags_db = debtags.DB() | ||
| 57 | 61 | ||
| 58 | - def load(self): | 62 | + db = open(self.db_path) |
| 63 | + md5 = hashlib.md5() | ||
| 64 | + md5.update(db.read()) | ||
| 65 | + self.db_md5 = md5.hexdigest() | ||
| 66 | + | ||
| 67 | + self.load_index(cfg.reindex) | ||
| 68 | + | ||
| 69 | + def load_db(self): | ||
| 59 | tag_filter = re.compile(r"^special::.+$|^.+::TODO$") | 70 | tag_filter = re.compile(r"^special::.+$|^.+::TODO$") |
| 60 | try: | 71 | try: |
| 61 | - self.read(open(self.path, "r"), lambda x: not tag_filter.match(x)) | ||
| 62 | - return 1 | ||
| 63 | - except IOError: | ||
| 64 | - logging.error("IOError: could not open debtags file \'%s\'" % | ||
| 65 | - self.path) | ||
| 66 | - return 0 | ||
| 67 | - | ||
| 68 | - def get_relevant_tags(self,pkgs_list,qtd_of_tags): | 72 | + db_file = open(self.db_path, "r") |
| 73 | + self.debtags_db.read(db_file,lambda x: not tag_filter.match(x)) | ||
| 74 | + except IOError: #FIXME try is not catching this | ||
| 75 | + logging.error("Could not load DebtagsDB from %s." % self.db_path) | ||
| 76 | + raise Error | ||
| 77 | + | ||
| 78 | + def relevant_tags_from_db(self,pkgs_list,qtd_of_tags): | ||
| 69 | """ | 79 | """ |
| 70 | Return most relevant tags considering a list of packages. | 80 | Return most relevant tags considering a list of packages. |
| 71 | """ | 81 | """ |
| 72 | - relevant_db = self.choose_packages(pkgs_list) | ||
| 73 | - relevance_index = debtags.relevance_index_function(self,relevant_db) | 82 | + if not self.debtags_db.package_count(): |
| 83 | + self.load_db() | ||
| 84 | + relevant_db = self.debtags_db.choose_packages(pkgs_list) | ||
| 85 | + relevance_index = debtags.relevance_index_function(self.debtags_db, | ||
| 86 | + relevant_db) | ||
| 74 | sorted_relevant_tags = sorted(relevant_db.iter_tags(), | 87 | sorted_relevant_tags = sorted(relevant_db.iter_tags(), |
| 75 | lambda a, b: cmp(relevance_index(a), | 88 | lambda a, b: cmp(relevance_index(a), |
| 76 | relevance_index(b))) | 89 | relevance_index(b))) |
| 77 | return normalize_tags(' '.join(sorted_relevant_tags[-qtd_of_tags:])) | 90 | return normalize_tags(' '.join(sorted_relevant_tags[-qtd_of_tags:])) |
| 78 | 91 | ||
| 79 | -class DebtagsIndex(xapian.WritableDatabase,Singleton): | ||
| 80 | - def __init__(self,path): | ||
| 81 | - self.path = path | ||
| 82 | - self.db_md5 = 0 | ||
| 83 | - | ||
| 84 | - def load(self,debtags_db,reindex=0): | 92 | + def load_index(self,reindex): |
| 85 | """ | 93 | """ |
| 86 | Load an existing debtags index. | 94 | Load an existing debtags index. |
| 87 | """ | 95 | """ |
| 88 | - self.debtags_db = debtags_db | ||
| 89 | - db = open(debtags_db.path) | ||
| 90 | - md5 = hashlib.md5() | ||
| 91 | - md5.update(db.read()) | ||
| 92 | - self.db_md5 = md5.hexdigest() | ||
| 93 | - | ||
| 94 | if not reindex: | 96 | if not reindex: |
| 95 | try: | 97 | try: |
| 96 | logging.info("Opening existing debtags xapian index at \'%s\'" | 98 | logging.info("Opening existing debtags xapian index at \'%s\'" |
| @@ -105,11 +107,11 @@ class DebtagsIndex(xapian.WritableDatabase,Singleton): | @@ -105,11 +107,11 @@ class DebtagsIndex(xapian.WritableDatabase,Singleton): | ||
| 105 | reindex =1 | 107 | reindex =1 |
| 106 | 108 | ||
| 107 | if reindex: | 109 | if reindex: |
| 108 | - self.create_index(debtags_db) | 110 | + self.new_index() |
| 109 | 111 | ||
| 110 | - def create_index(self,debtags_db): | 112 | + def new_index(self): |
| 111 | """ | 113 | """ |
| 112 | - Create a xapian index for debtags info based on file 'debtags_db' and | 114 | + Create a xapian index for debtags info based on 'debtags_db' and |
| 113 | place it at 'index_path'. | 115 | place it at 'index_path'. |
| 114 | """ | 116 | """ |
| 115 | if not os.path.exists(self.path): | 117 | if not os.path.exists(self.path): |
| @@ -122,10 +124,12 @@ class DebtagsIndex(xapian.WritableDatabase,Singleton): | @@ -122,10 +124,12 @@ class DebtagsIndex(xapian.WritableDatabase,Singleton): | ||
| 122 | xapian.DB_CREATE_OR_OVERWRITE) | 124 | xapian.DB_CREATE_OR_OVERWRITE) |
| 123 | except xapian.DatabaseError: | 125 | except xapian.DatabaseError: |
| 124 | logging.critical("Could not create xapian index.") | 126 | logging.critical("Could not create xapian index.") |
| 125 | - exit(1) | 127 | + raise Error |
| 126 | 128 | ||
| 129 | + self.load_db() | ||
| 127 | self.set_metadata("md5",self.db_md5) | 130 | self.set_metadata("md5",self.db_md5) |
| 128 | - for pkg,tags in debtags_db.iter_packages_tags(): | 131 | + |
| 132 | + for pkg,tags in self.debtags_db.iter_packages_tags(): | ||
| 129 | doc = xapian.Document() | 133 | doc = xapian.Document() |
| 130 | doc.set_data(pkg) | 134 | doc.set_data(pkg) |
| 131 | for tag in tags: | 135 | for tag in tags: |
src/evaluation.py
| @@ -105,7 +105,7 @@ class CrossValidation: | @@ -105,7 +105,7 @@ class CrossValidation: | ||
| 105 | self.partition_proportion = partition_proportion | 105 | self.partition_proportion = partition_proportion |
| 106 | else: | 106 | else: |
| 107 | logging.critical("A proporcao de particao deve ser um avalor ente 0 e 1.") | 107 | logging.critical("A proporcao de particao deve ser um avalor ente 0 e 1.") |
| 108 | - exit(1) | 108 | + raise Error |
| 109 | self.rounds = rounds | 109 | self.rounds = rounds |
| 110 | self.recommender = rec | 110 | self.recommender = rec |
| 111 | self.metrics_list = metrics_list | 111 | self.metrics_list = metrics_list |
| @@ -143,7 +143,7 @@ class CrossValidation: | @@ -143,7 +143,7 @@ class CrossValidation: | ||
| 143 | random_key = random.choice(cross_item_score.keys()) | 143 | random_key = random.choice(cross_item_score.keys()) |
| 144 | else: | 144 | else: |
| 145 | logging.critical("cross_item_score vazio") | 145 | logging.critical("cross_item_score vazio") |
| 146 | - exit(1) | 146 | + raise Error |
| 147 | round_partition[random_key] = cross_item_score.pop(random_key) | 147 | round_partition[random_key] = cross_item_score.pop(random_key) |
| 148 | round_user = User(cross_item_score) | 148 | round_user = User(cross_item_score) |
| 149 | predicted_result = self.recommender.get_recommendation(round_user) | 149 | predicted_result = self.recommender.get_recommendation(round_user) |
src/recommender.py
| @@ -18,6 +18,9 @@ | @@ -18,6 +18,9 @@ | ||
| 18 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | 18 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
| 19 | 19 | ||
| 20 | from operator import itemgetter | 20 | from operator import itemgetter |
| 21 | +from data import * | ||
| 22 | +from strategy import * | ||
| 23 | +from error import Error | ||
| 21 | 24 | ||
| 22 | class RecommendationResult: | 25 | class RecommendationResult: |
| 23 | def __init__(self,item_score,size): | 26 | def __init__(self,item_score,size): |
| @@ -37,11 +40,22 @@ class RecommendationResult: | @@ -37,11 +40,22 @@ class RecommendationResult: | ||
| 37 | 40 | ||
| 38 | class Recommender: | 41 | class Recommender: |
| 39 | """ """ | 42 | """ """ |
| 40 | - def __init__(self,items_repository,users_repository=None, | ||
| 41 | - knowledge_repository=None): | ||
| 42 | - self.items_repository = items_repository | ||
| 43 | - self.users_repository = users_repository | ||
| 44 | - self.knowledge_repository = knowledge_repository | 43 | + def __init__(self,cfg): |
| 44 | + try: | ||
| 45 | + strategy = "self."+cfg.strategy+"(cfg)" | ||
| 46 | + exec(strategy) | ||
| 47 | + except (NameError, AttributeError, SyntaxError): | ||
| 48 | + logging.critical("Could not perform recommendation strategy '%s'" % | ||
| 49 | + cfg.strategy) | ||
| 50 | + raise Error | ||
| 51 | + | ||
| 52 | + def ct(self,cfg): | ||
| 53 | + self.items_repository = TagsXapianIndex(cfg) | ||
| 54 | + self.strategy = ContentBasedStrategy() | ||
| 55 | + | ||
| 56 | + def cta(self,cfg): | ||
| 57 | + self.items_repository = xapian.Database(cfg.axi) | ||
| 58 | + self.strategy = AxiContentBasedStrategy() | ||
| 45 | 59 | ||
| 46 | def set_strategy(self,strategy): | 60 | def set_strategy(self,strategy): |
| 47 | """ """ | 61 | """ """ |
src/strategy.py
| @@ -20,7 +20,7 @@ | @@ -20,7 +20,7 @@ | ||
| 20 | import os, re | 20 | import os, re |
| 21 | import xapian | 21 | import xapian |
| 22 | from data import * | 22 | from data import * |
| 23 | -from recommender import * | 23 | +import recommender |
| 24 | 24 | ||
| 25 | class ReputationHeuristic: | 25 | class ReputationHeuristic: |
| 26 | """ | 26 | """ |
| @@ -75,50 +75,50 @@ class ContentBasedStrategy(RecommendationStrategy): | @@ -75,50 +75,50 @@ class ContentBasedStrategy(RecommendationStrategy): | ||
| 75 | """ | 75 | """ |
| 76 | Content-based recommendation strategy. | 76 | Content-based recommendation strategy. |
| 77 | """ | 77 | """ |
| 78 | - def run(self,recommender,user): | 78 | + def run(self,rec,user): |
| 79 | """ | 79 | """ |
| 80 | Perform recommendation strategy. | 80 | Perform recommendation strategy. |
| 81 | """ | 81 | """ |
| 82 | - profile = user.debtags_tag_profile(recommender.items_repository.debtags_db,50) | 82 | + profile = user.txi_tag_profile(rec.items_repository,50) |
| 83 | qp = xapian.QueryParser() | 83 | qp = xapian.QueryParser() |
| 84 | query = qp.parse_query(profile) | 84 | query = qp.parse_query(profile) |
| 85 | - enquire = xapian.Enquire(recommender.items_repository) | 85 | + enquire = xapian.Enquire(rec.items_repository) |
| 86 | enquire.set_query(query) | 86 | enquire.set_query(query) |
| 87 | 87 | ||
| 88 | try: | 88 | try: |
| 89 | mset = enquire.get_mset(0, 20, None, PkgMatchDecider(user.items())) | 89 | mset = enquire.get_mset(0, 20, None, PkgMatchDecider(user.items())) |
| 90 | except xapian.DatabaseError as error: | 90 | except xapian.DatabaseError as error: |
| 91 | logging.critical(error.get_msg()) | 91 | logging.critical(error.get_msg()) |
| 92 | - exit(1) | 92 | + raise Error |
| 93 | 93 | ||
| 94 | item_score = {} | 94 | item_score = {} |
| 95 | for m in mset: | 95 | for m in mset: |
| 96 | item_score[m.document.get_data()] = m.rank | 96 | item_score[m.document.get_data()] = m.rank |
| 97 | - return RecommendationResult(item_score,20) | 97 | + return recommender.RecommendationResult(item_score,20) |
| 98 | 98 | ||
| 99 | class AxiContentBasedStrategy(RecommendationStrategy): | 99 | class AxiContentBasedStrategy(RecommendationStrategy): |
| 100 | """ | 100 | """ |
| 101 | Content-based recommendation strategy based on Apt-xapian-index. | 101 | Content-based recommendation strategy based on Apt-xapian-index. |
| 102 | """ | 102 | """ |
| 103 | - def run(self,recommender,user): | 103 | + def run(self,rec,user): |
| 104 | """ | 104 | """ |
| 105 | Perform recommendation strategy. | 105 | Perform recommendation strategy. |
| 106 | """ | 106 | """ |
| 107 | - profile = user.axi_tag_profile(recommender.items_repository,50) | 107 | + profile = user.axi_tag_profile(rec.items_repository,50) |
| 108 | query = xapian.Query(xapian.Query.OP_OR,profile) | 108 | query = xapian.Query(xapian.Query.OP_OR,profile) |
| 109 | - enquire = xapian.Enquire(recommender.items_repository) | 109 | + enquire = xapian.Enquire(rec.items_repository) |
| 110 | enquire.set_query(query) | 110 | enquire.set_query(query) |
| 111 | 111 | ||
| 112 | try: | 112 | try: |
| 113 | mset = enquire.get_mset(0, 20, None, PkgMatchDecider(user.items())) | 113 | mset = enquire.get_mset(0, 20, None, PkgMatchDecider(user.items())) |
| 114 | except xapian.DatabaseError as error: | 114 | except xapian.DatabaseError as error: |
| 115 | logging.critical(error.get_msg()) | 115 | logging.critical(error.get_msg()) |
| 116 | - exit(1) | 116 | + raise Error |
| 117 | 117 | ||
| 118 | item_score = {} | 118 | item_score = {} |
| 119 | for m in mset: | 119 | for m in mset: |
| 120 | item_score[m.document.get_data()] = m.rank | 120 | item_score[m.document.get_data()] = m.rank |
| 121 | - return RecommendationResult(item_score,20) | 121 | + return recommender.RecommendationResult(item_score,20) |
| 122 | 122 | ||
| 123 | class ColaborativeStrategy(RecommendationStrategy): | 123 | class ColaborativeStrategy(RecommendationStrategy): |
| 124 | """ | 124 | """ |
src/user.py
| @@ -39,12 +39,12 @@ class User: | @@ -39,12 +39,12 @@ class User: | ||
| 39 | def items(self): | 39 | def items(self): |
| 40 | return self.item_score.keys() | 40 | return self.item_score.keys() |
| 41 | 41 | ||
| 42 | - def axi_tag_profile(self,xapian_db,profile_size): | 42 | + def axi_tag_profile(self,apt_xapian_index,profile_size): |
| 43 | terms = [] | 43 | terms = [] |
| 44 | for item in self.items(): | 44 | for item in self.items(): |
| 45 | terms.append("XP"+item) | 45 | terms.append("XP"+item) |
| 46 | query = xapian.Query(xapian.Query.OP_OR, terms) | 46 | query = xapian.Query(xapian.Query.OP_OR, terms) |
| 47 | - enquire = xapian.Enquire(xapian_db) | 47 | + enquire = xapian.Enquire(apt_xapian_index) |
| 48 | enquire.set_query(query) | 48 | enquire.set_query(query) |
| 49 | rset = xapian.RSet() | 49 | rset = xapian.RSet() |
| 50 | for m in enquire.get_mset(0,30000): #consider all matches | 50 | for m in enquire.get_mset(0,30000): #consider all matches |
| @@ -56,8 +56,9 @@ class User: | @@ -56,8 +56,9 @@ class User: | ||
| 56 | logging.debug("%.2f %s" % (res.weight,res.term[2:])) | 56 | logging.debug("%.2f %s" % (res.weight,res.term[2:])) |
| 57 | return profile | 57 | return profile |
| 58 | 58 | ||
| 59 | - def debtags_tag_profile(self,debtags_db,profile_size): | ||
| 60 | - return debtags_db.get_relevant_tags(self.items(),profile_size) | 59 | + def txi_tag_profile(self,tags_xapian_index,profile_size): |
| 60 | + return tags_xapian_index.relevant_tags_from_db(self.items(), | ||
| 61 | + profile_size) | ||
| 61 | 62 | ||
| 62 | class LocalSystem(User): | 63 | class LocalSystem(User): |
| 63 | """ """ | 64 | """ """ |