Commit 43742989ec7b23b05f0d73198a287523fc52cdbc
1 parent
49638292
Exists in
master
and in
1 other branch
Introduced the configuration option 'reindex', which can be set with '-r' or
'--force-reindex' on the command line. If reindex is not set, the program tries to reuse an existing index and creates a new one only if the debtags database has been modified or an error occurs when opening the index. (close #8)
Showing
4 changed files
with
50 additions
and
17 deletions
Show diff stats
src/app_recommender.py
@@ -30,8 +30,6 @@ from strategy import * | @@ -30,8 +30,6 @@ from strategy import * | ||
30 | from user import * | 30 | from user import * |
31 | 31 | ||
32 | def set_up_recommender(cfg): | 32 | def set_up_recommender(cfg): |
33 | - reindex = 1 #FIXME should do it only if necessary | ||
34 | - | ||
35 | if cfg.strategy == "cta": | 33 | if cfg.strategy == "cta": |
36 | axi_db = xapian.Database(cfg.axi) | 34 | axi_db = xapian.Database(cfg.axi) |
37 | app_rec = Recommender(axi_db) | 35 | app_rec = Recommender(axi_db) |
@@ -43,7 +41,7 @@ def set_up_recommender(cfg): | @@ -43,7 +41,7 @@ def set_up_recommender(cfg): | ||
43 | logging.error("Could not load DebtagsDB from %s." % cfg.tags_db) | 41 | logging.error("Could not load DebtagsDB from %s." % cfg.tags_db) |
44 | sys.exit(1) | 42 | sys.exit(1) |
45 | debtags_index = DebtagsIndex(os.path.expanduser(cfg.tags_index)) | 43 | debtags_index = DebtagsIndex(os.path.expanduser(cfg.tags_index)) |
46 | - debtags_index.load(debtags_db,reindex) | 44 | + debtags_index.load(debtags_db,cfg.reindex) |
47 | app_rec = Recommender(debtags_index) | 45 | app_rec = Recommender(debtags_index) |
48 | app_rec.set_strategy(ContentBasedStrategy()) | 46 | app_rec.set_strategy(ContentBasedStrategy()) |
49 | 47 |
src/config.py
@@ -42,6 +42,7 @@ class Config(): | @@ -42,6 +42,7 @@ class Config(): | ||
42 | self.axi = "/var/lib/apt-xapian-index/index" | 42 | self.axi = "/var/lib/apt-xapian-index/index" |
43 | self.axi_values = "/var/lib/apt-xapian-index/values" | 43 | self.axi_values = "/var/lib/apt-xapian-index/values" |
44 | self.strategy = "ct" # defaults to the cheapest one | 44 | self.strategy = "ct" # defaults to the cheapest one |
45 | + self.reindex = 0 | ||
45 | 46 | ||
46 | def usage(self): | 47 | def usage(self): |
47 | """ | 48 | """ |
@@ -57,6 +58,7 @@ class Config(): | @@ -57,6 +58,7 @@ class Config(): | ||
57 | print " [ recommender ]" | 58 | print " [ recommender ]" |
58 | print " -t, --tagsdb=PATH Path to debtags database." | 59 | print " -t, --tagsdb=PATH Path to debtags database." |
59 | print " -i, --tagsindex=PATH Path to debtags dedicated index." | 60 | print " -i, --tagsindex=PATH Path to debtags dedicated index." |
61 | + print " -r, --force-reindex Force reindexing debtags database." | ||
60 | print " -a, --axi=PATH Path to Apt-xapian-index." | 62 | print " -a, --axi=PATH Path to Apt-xapian-index." |
61 | print " -s, --strategy=OPTION Recommendation strategy." | 63 | print " -s, --strategy=OPTION Recommendation strategy." |
62 | print "" | 64 | print "" |
@@ -98,17 +100,18 @@ class Config(): | @@ -98,17 +100,18 @@ class Config(): | ||
98 | 100 | ||
99 | self.tags_db = self.read_option('recommender', 'tags_db') | 101 | self.tags_db = self.read_option('recommender', 'tags_db') |
100 | self.tags_index = self.read_option('recommender', 'tags_index') | 102 | self.tags_index = self.read_option('recommender', 'tags_index') |
103 | + self.reindex = self.read_option('recommender', 'reindex') | ||
101 | self.axi = self.read_option('recommender', 'axi') | 104 | self.axi = self.read_option('recommender', 'axi') |
102 | 105 | ||
103 | - short_options = "hdvo:c:t:i:a:s:" | 106 | + short_options = "hdvo:c:t:i:ra:s:" |
104 | long_options = ["help", "debug", "verbose", "output=", "config=", | 107 | long_options = ["help", "debug", "verbose", "output=", "config=", |
105 | - "tagsdb=", "tagsindex=", "axi=", "strategy="] | 108 | + "tagsdb=", "tagsindex=", "reindex", "axi=", "strategy="] |
106 | try: | 109 | try: |
107 | opts, args = getopt.getopt(sys.argv[1:], short_options, | 110 | opts, args = getopt.getopt(sys.argv[1:], short_options, |
108 | long_options) | 111 | long_options) |
109 | - except getopt.GetoptError, err: | ||
110 | - logging.error("Error parsing args: %s", str(err)) | ||
111 | - print "Syntax error" | 112 | + except getopt.GetoptError as error: |
113 | + self.set_logger() | ||
114 | + logging.error("Bad syntax: %s" % str(error)) | ||
112 | self.usage() | 115 | self.usage() |
113 | sys.exit() | 116 | sys.exit() |
114 | 117 | ||
@@ -128,6 +131,8 @@ class Config(): | @@ -128,6 +131,8 @@ class Config(): | ||
128 | self.tagsdb = p | 131 | self.tagsdb = p |
129 | elif o in ("-i", "--tagsindex"): | 132 | elif o in ("-i", "--tagsindex"): |
130 | self.tagsindex = p | 133 | self.tagsindex = p |
134 | + elif o in ("-r", "--force-reindex"): | ||
135 | + self.reindex = 1 | ||
131 | elif o in ("-a", "--axi"): | 136 | elif o in ("-a", "--axi"): |
132 | self.axi = p + "/index" | 137 | self.axi = p + "/index" |
133 | self.axi_values = p + "/values" | 138 | self.axi_values = p + "/values" |
src/data.py
@@ -24,6 +24,7 @@ import xapian | @@ -24,6 +24,7 @@ import xapian | ||
24 | import axi | 24 | import axi |
25 | from debian import debtags | 25 | from debian import debtags |
26 | import logging | 26 | import logging |
27 | +import hashlib | ||
27 | 28 | ||
28 | class Item: | 29 | class Item: |
29 | """ """ | 30 | """ """ |
@@ -74,33 +75,52 @@ class DebtagsDB(debtags.DB): | @@ -74,33 +75,52 @@ class DebtagsDB(debtags.DB): | ||
74 | class DebtagsIndex(xapian.WritableDatabase): | 75 | class DebtagsIndex(xapian.WritableDatabase): |
75 | def __init__(self,path): | 76 | def __init__(self,path): |
76 | self.path = path | 77 | self.path = path |
78 | + self.db_md5 = 0 | ||
77 | 79 | ||
78 | - def load(self,debtags_db,reindex): | 80 | + def load(self,debtags_db,reindex=0): |
79 | """ | 81 | """ |
80 | Load an existing debtags index. | 82 | Load an existing debtags index. |
81 | """ | 83 | """ |
82 | self.debtags_db = debtags_db | 84 | self.debtags_db = debtags_db |
85 | + db = open(debtags_db.path) | ||
86 | + md5 = hashlib.md5() | ||
87 | + md5.update(db.read()) | ||
88 | + self.db_md5 = md5.hexdigest() | ||
89 | + | ||
83 | if not reindex: | 90 | if not reindex: |
84 | try: | 91 | try: |
85 | logging.info("Opening existing debtags xapian index at \'%s\'" | 92 | logging.info("Opening existing debtags xapian index at \'%s\'" |
86 | % self.path) | 93 | % self.path) |
87 | xapian.Database.__init__(self,self.path) | 94 | xapian.Database.__init__(self,self.path) |
95 | + md5 = self.get_metadata("md5") | ||
96 | + if not md5 == self.db_md5: | ||
97 | + logging.info("Index must be updated.") | ||
98 | + reindex = 1 | ||
88 | except xapian.DatabaseError: | 99 | except xapian.DatabaseError: |
89 | - logging.error("Could not open debtags xapian index") | 100 | + logging.info("Could not open index.") |
90 | reindex =1 | 101 | reindex =1 |
102 | + | ||
91 | if reindex: | 103 | if reindex: |
92 | - self.reindex(debtags_db) | 104 | + self.create_index(debtags_db) |
93 | 105 | ||
94 | - def reindex(self,debtags_db): | 106 | + def create_index(self,debtags_db): |
95 | """ | 107 | """ |
96 | Create a xapian index for debtags info based on file 'debtags_db' and | 108 | Create a xapian index for debtags info based on file 'debtags_db' and |
97 | place it at 'index_path'. | 109 | place it at 'index_path'. |
98 | """ | 110 | """ |
99 | if not os.path.exists(self.path): | 111 | if not os.path.exists(self.path): |
100 | os.makedirs(self.path) | 112 | os.makedirs(self.path) |
101 | - logging.info("Creating new debtags xapian index at \'%s\'" % self.path) | ||
102 | - xapian.WritableDatabase.__init__(self,self.path, | ||
103 | - xapian.DB_CREATE_OR_OVERWRITE) | 113 | + |
114 | + try: | ||
115 | + logging.info("Creating new xapian index for debtags at \'%s\'" % | ||
116 | + self.path) | ||
117 | + xapian.WritableDatabase.__init__(self,self.path, | ||
118 | + xapian.DB_CREATE_OR_OVERWRITE) | ||
119 | + except xapian.DatabaseError: | ||
120 | + logging.critical("Could not create xapian index.") | ||
121 | + exit(1) | ||
122 | + | ||
123 | + self.set_metadata("md5",self.db_md5) | ||
104 | for pkg,tags in debtags_db.iter_packages_tags(): | 124 | for pkg,tags in debtags_db.iter_packages_tags(): |
105 | doc = xapian.Document() | 125 | doc = xapian.Document() |
106 | doc.set_data(pkg) | 126 | doc.set_data(pkg) |
src/strategy.py
@@ -85,7 +85,12 @@ class ContentBasedStrategy(RecommendationStrategy): | @@ -85,7 +85,12 @@ class ContentBasedStrategy(RecommendationStrategy): | ||
85 | enquire = xapian.Enquire(recommender.items_repository) | 85 | enquire = xapian.Enquire(recommender.items_repository) |
86 | enquire.set_query(query) | 86 | enquire.set_query(query) |
87 | 87 | ||
88 | - mset = enquire.get_mset(0, 20, None, PkgMatchDecider(user.items())) | 88 | + try: |
89 | + mset = enquire.get_mset(0, 20, None, PkgMatchDecider(user.items())) | ||
90 | + except xapian.DatabaseError as error: | ||
91 | + logging.critical(error.get_msg()) | ||
92 | + exit(1) | ||
93 | + | ||
89 | item_score = {} | 94 | item_score = {} |
90 | for m in mset: | 95 | for m in mset: |
91 | item_score[m.document.get_data()] = m.rank | 96 | item_score[m.document.get_data()] = m.rank |
@@ -104,7 +109,12 @@ class AxiContentBasedStrategy(RecommendationStrategy): | @@ -104,7 +109,12 @@ class AxiContentBasedStrategy(RecommendationStrategy): | ||
104 | enquire = xapian.Enquire(recommender.items_repository) | 109 | enquire = xapian.Enquire(recommender.items_repository) |
105 | enquire.set_query(query) | 110 | enquire.set_query(query) |
106 | 111 | ||
107 | - mset = enquire.get_mset(0, 20, None, PkgMatchDecider(user.items())) | 112 | + try: |
113 | + mset = enquire.get_mset(0, 20, None, PkgMatchDecider(user.items())) | ||
114 | + except xapian.DatabaseError as error: | ||
115 | + logging.critical(error.get_msg()) | ||
116 | + exit(1) | ||
117 | + | ||
108 | item_score = {} | 118 | item_score = {} |
109 | for m in mset: | 119 | for m in mset: |
110 | item_score[m.document.get_data()] = m.rank | 120 | item_score[m.document.get_data()] = m.rank |