Commit 43742989ec7b23b05f0d73198a287523fc52cdbc
1 parent
49638292
Exists in
master
and in
1 other branch
Introduced configuration option 'reindex', which can be set using '-r' or
'--force-reindex' on the command line. If reindex is not set, the program tries to reuse an existing index and creates a new one only if the debtags database has been modified or an error occurs when opening the index. (close #8)
Showing
4 changed files
with
50 additions
and
17 deletions
Show diff stats
src/app_recommender.py
| @@ -30,8 +30,6 @@ from strategy import * | @@ -30,8 +30,6 @@ from strategy import * | ||
| 30 | from user import * | 30 | from user import * |
| 31 | 31 | ||
| 32 | def set_up_recommender(cfg): | 32 | def set_up_recommender(cfg): |
| 33 | - reindex = 1 #FIXME should do it only if necessary | ||
| 34 | - | ||
| 35 | if cfg.strategy == "cta": | 33 | if cfg.strategy == "cta": |
| 36 | axi_db = xapian.Database(cfg.axi) | 34 | axi_db = xapian.Database(cfg.axi) |
| 37 | app_rec = Recommender(axi_db) | 35 | app_rec = Recommender(axi_db) |
| @@ -43,7 +41,7 @@ def set_up_recommender(cfg): | @@ -43,7 +41,7 @@ def set_up_recommender(cfg): | ||
| 43 | logging.error("Could not load DebtagsDB from %s." % cfg.tags_db) | 41 | logging.error("Could not load DebtagsDB from %s." % cfg.tags_db) |
| 44 | sys.exit(1) | 42 | sys.exit(1) |
| 45 | debtags_index = DebtagsIndex(os.path.expanduser(cfg.tags_index)) | 43 | debtags_index = DebtagsIndex(os.path.expanduser(cfg.tags_index)) |
| 46 | - debtags_index.load(debtags_db,reindex) | 44 | + debtags_index.load(debtags_db,cfg.reindex) |
| 47 | app_rec = Recommender(debtags_index) | 45 | app_rec = Recommender(debtags_index) |
| 48 | app_rec.set_strategy(ContentBasedStrategy()) | 46 | app_rec.set_strategy(ContentBasedStrategy()) |
| 49 | 47 |
src/config.py
| @@ -42,6 +42,7 @@ class Config(): | @@ -42,6 +42,7 @@ class Config(): | ||
| 42 | self.axi = "/var/lib/apt-xapian-index/index" | 42 | self.axi = "/var/lib/apt-xapian-index/index" |
| 43 | self.axi_values = "/var/lib/apt-xapian-index/values" | 43 | self.axi_values = "/var/lib/apt-xapian-index/values" |
| 44 | self.strategy = "ct" # defaults to the cheapest one | 44 | self.strategy = "ct" # defaults to the cheapest one |
| 45 | + self.reindex = 0 | ||
| 45 | 46 | ||
| 46 | def usage(self): | 47 | def usage(self): |
| 47 | """ | 48 | """ |
| @@ -57,6 +58,7 @@ class Config(): | @@ -57,6 +58,7 @@ class Config(): | ||
| 57 | print " [ recommender ]" | 58 | print " [ recommender ]" |
| 58 | print " -t, --tagsdb=PATH Path to debtags database." | 59 | print " -t, --tagsdb=PATH Path to debtags database." |
| 59 | print " -i, --tagsindex=PATH Path to debtags dedicated index." | 60 | print " -i, --tagsindex=PATH Path to debtags dedicated index." |
| 61 | + print " -r, --force-reindex Force reindexing debtags database." | ||
| 60 | print " -a, --axi=PATH Path to Apt-xapian-index." | 62 | print " -a, --axi=PATH Path to Apt-xapian-index." |
| 61 | print " -s, --strategy=OPTION Recommendation strategy." | 63 | print " -s, --strategy=OPTION Recommendation strategy." |
| 62 | print "" | 64 | print "" |
| @@ -98,17 +100,18 @@ class Config(): | @@ -98,17 +100,18 @@ class Config(): | ||
| 98 | 100 | ||
| 99 | self.tags_db = self.read_option('recommender', 'tags_db') | 101 | self.tags_db = self.read_option('recommender', 'tags_db') |
| 100 | self.tags_index = self.read_option('recommender', 'tags_index') | 102 | self.tags_index = self.read_option('recommender', 'tags_index') |
| 103 | + self.reindex = self.read_option('recommender', 'reindex') | ||
| 101 | self.axi = self.read_option('recommender', 'axi') | 104 | self.axi = self.read_option('recommender', 'axi') |
| 102 | 105 | ||
| 103 | - short_options = "hdvo:c:t:i:a:s:" | 106 | + short_options = "hdvo:c:t:i:ra:s:" |
| 104 | long_options = ["help", "debug", "verbose", "output=", "config=", | 107 | long_options = ["help", "debug", "verbose", "output=", "config=", |
| 105 | - "tagsdb=", "tagsindex=", "axi=", "strategy="] | 108 | + "tagsdb=", "tagsindex=", "reindex", "axi=", "strategy="] |
| 106 | try: | 109 | try: |
| 107 | opts, args = getopt.getopt(sys.argv[1:], short_options, | 110 | opts, args = getopt.getopt(sys.argv[1:], short_options, |
| 108 | long_options) | 111 | long_options) |
| 109 | - except getopt.GetoptError, err: | ||
| 110 | - logging.error("Error parsing args: %s", str(err)) | ||
| 111 | - print "Syntax error" | 112 | + except getopt.GetoptError as error: |
| 113 | + self.set_logger() | ||
| 114 | + logging.error("Bad syntax: %s" % str(error)) | ||
| 112 | self.usage() | 115 | self.usage() |
| 113 | sys.exit() | 116 | sys.exit() |
| 114 | 117 | ||
| @@ -128,6 +131,8 @@ class Config(): | @@ -128,6 +131,8 @@ class Config(): | ||
| 128 | self.tagsdb = p | 131 | self.tagsdb = p |
| 129 | elif o in ("-i", "--tagsindex"): | 132 | elif o in ("-i", "--tagsindex"): |
| 130 | self.tagsindex = p | 133 | self.tagsindex = p |
| 134 | + elif o in ("-r", "--force-reindex"): | ||
| 135 | + self.reindex = 1 | ||
| 131 | elif o in ("-a", "--axi"): | 136 | elif o in ("-a", "--axi"): |
| 132 | self.axi = p + "/index" | 137 | self.axi = p + "/index" |
| 133 | self.axi_values = p + "/values" | 138 | self.axi_values = p + "/values" |
src/data.py
| @@ -24,6 +24,7 @@ import xapian | @@ -24,6 +24,7 @@ import xapian | ||
| 24 | import axi | 24 | import axi |
| 25 | from debian import debtags | 25 | from debian import debtags |
| 26 | import logging | 26 | import logging |
| 27 | +import hashlib | ||
| 27 | 28 | ||
| 28 | class Item: | 29 | class Item: |
| 29 | """ """ | 30 | """ """ |
| @@ -74,33 +75,52 @@ class DebtagsDB(debtags.DB): | @@ -74,33 +75,52 @@ class DebtagsDB(debtags.DB): | ||
| 74 | class DebtagsIndex(xapian.WritableDatabase): | 75 | class DebtagsIndex(xapian.WritableDatabase): |
| 75 | def __init__(self,path): | 76 | def __init__(self,path): |
| 76 | self.path = path | 77 | self.path = path |
| 78 | + self.db_md5 = 0 | ||
| 77 | 79 | ||
| 78 | - def load(self,debtags_db,reindex): | 80 | + def load(self,debtags_db,reindex=0): |
| 79 | """ | 81 | """ |
| 80 | Load an existing debtags index. | 82 | Load an existing debtags index. |
| 81 | """ | 83 | """ |
| 82 | self.debtags_db = debtags_db | 84 | self.debtags_db = debtags_db |
| 85 | + db = open(debtags_db.path) | ||
| 86 | + md5 = hashlib.md5() | ||
| 87 | + md5.update(db.read()) | ||
| 88 | + self.db_md5 = md5.hexdigest() | ||
| 89 | + | ||
| 83 | if not reindex: | 90 | if not reindex: |
| 84 | try: | 91 | try: |
| 85 | logging.info("Opening existing debtags xapian index at \'%s\'" | 92 | logging.info("Opening existing debtags xapian index at \'%s\'" |
| 86 | % self.path) | 93 | % self.path) |
| 87 | xapian.Database.__init__(self,self.path) | 94 | xapian.Database.__init__(self,self.path) |
| 95 | + md5 = self.get_metadata("md5") | ||
| 96 | + if not md5 == self.db_md5: | ||
| 97 | + logging.info("Index must be updated.") | ||
| 98 | + reindex = 1 | ||
| 88 | except xapian.DatabaseError: | 99 | except xapian.DatabaseError: |
| 89 | - logging.error("Could not open debtags xapian index") | 100 | + logging.info("Could not open index.") |
| 90 | reindex =1 | 101 | reindex =1 |
| 102 | + | ||
| 91 | if reindex: | 103 | if reindex: |
| 92 | - self.reindex(debtags_db) | 104 | + self.create_index(debtags_db) |
| 93 | 105 | ||
| 94 | - def reindex(self,debtags_db): | 106 | + def create_index(self,debtags_db): |
| 95 | """ | 107 | """ |
| 96 | Create a xapian index for debtags info based on file 'debtags_db' and | 108 | Create a xapian index for debtags info based on file 'debtags_db' and |
| 97 | place it at 'index_path'. | 109 | place it at 'index_path'. |
| 98 | """ | 110 | """ |
| 99 | if not os.path.exists(self.path): | 111 | if not os.path.exists(self.path): |
| 100 | os.makedirs(self.path) | 112 | os.makedirs(self.path) |
| 101 | - logging.info("Creating new debtags xapian index at \'%s\'" % self.path) | ||
| 102 | - xapian.WritableDatabase.__init__(self,self.path, | ||
| 103 | - xapian.DB_CREATE_OR_OVERWRITE) | 113 | + |
| 114 | + try: | ||
| 115 | + logging.info("Creating new xapian index for debtags at \'%s\'" % | ||
| 116 | + self.path) | ||
| 117 | + xapian.WritableDatabase.__init__(self,self.path, | ||
| 118 | + xapian.DB_CREATE_OR_OVERWRITE) | ||
| 119 | + except xapian.DatabaseError: | ||
| 120 | + logging.critical("Could not create xapian index.") | ||
| 121 | + exit(1) | ||
| 122 | + | ||
| 123 | + self.set_metadata("md5",self.db_md5) | ||
| 104 | for pkg,tags in debtags_db.iter_packages_tags(): | 124 | for pkg,tags in debtags_db.iter_packages_tags(): |
| 105 | doc = xapian.Document() | 125 | doc = xapian.Document() |
| 106 | doc.set_data(pkg) | 126 | doc.set_data(pkg) |
src/strategy.py
| @@ -85,7 +85,12 @@ class ContentBasedStrategy(RecommendationStrategy): | @@ -85,7 +85,12 @@ class ContentBasedStrategy(RecommendationStrategy): | ||
| 85 | enquire = xapian.Enquire(recommender.items_repository) | 85 | enquire = xapian.Enquire(recommender.items_repository) |
| 86 | enquire.set_query(query) | 86 | enquire.set_query(query) |
| 87 | 87 | ||
| 88 | - mset = enquire.get_mset(0, 20, None, PkgMatchDecider(user.items())) | 88 | + try: |
| 89 | + mset = enquire.get_mset(0, 20, None, PkgMatchDecider(user.items())) | ||
| 90 | + except xapian.DatabaseError as error: | ||
| 91 | + logging.critical(error.get_msg()) | ||
| 92 | + exit(1) | ||
| 93 | + | ||
| 89 | item_score = {} | 94 | item_score = {} |
| 90 | for m in mset: | 95 | for m in mset: |
| 91 | item_score[m.document.get_data()] = m.rank | 96 | item_score[m.document.get_data()] = m.rank |
| @@ -104,7 +109,12 @@ class AxiContentBasedStrategy(RecommendationStrategy): | @@ -104,7 +109,12 @@ class AxiContentBasedStrategy(RecommendationStrategy): | ||
| 104 | enquire = xapian.Enquire(recommender.items_repository) | 109 | enquire = xapian.Enquire(recommender.items_repository) |
| 105 | enquire.set_query(query) | 110 | enquire.set_query(query) |
| 106 | 111 | ||
| 107 | - mset = enquire.get_mset(0, 20, None, PkgMatchDecider(user.items())) | 112 | + try: |
| 113 | + mset = enquire.get_mset(0, 20, None, PkgMatchDecider(user.items())) | ||
| 114 | + except xapian.DatabaseError as error: | ||
| 115 | + logging.critical(error.get_msg()) | ||
| 116 | + exit(1) | ||
| 117 | + | ||
| 108 | item_score = {} | 118 | item_score = {} |
| 109 | for m in mset: | 119 | for m in mset: |
| 110 | item_score[m.document.get_data()] = m.rank | 120 | item_score[m.document.get_data()] = m.rank |