Commit f1b691fb888d2df5d82dc2aee04244c86b18b2ff
1 parent
ccdace0e
Exists in
master
and in
1 other branch
Updated indexer scripts and data classes.
Showing
3 changed files
with
230 additions
and
10 deletions
Show diff stats
... | ... | @@ -0,0 +1,77 @@ |
1 | +#!/usr/bin/env python | |
2 | +""" | |
3 | + indexer.py - generate xapian indexes to be used as items and users | |
4 | + repositories | |
5 | +""" | |
6 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | |
7 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | |
8 | +__license__ = """ | |
9 | + This program is free software: you can redistribute it and/or modify | |
10 | + it under the terms of the GNU General Public License as published by | |
11 | + the Free Software Foundation, either version 3 of the License, or | |
12 | + (at your option) any later version. | |
13 | + | |
14 | + This program is distributed in the hope that it will be useful, | |
15 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 | + GNU General Public License for more details. | |
18 | + | |
19 | + You should have received a copy of the GNU General Public License | |
20 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | |
21 | +""" | |
22 | + | |
23 | +import os | |
24 | +import sys | |
25 | +sys.path.insert(0,'../') | |
26 | +import datetime | |
27 | + | |
28 | +from config import Config | |
29 | +from error import Error | |
30 | +import data | |
31 | +import xapian | |
32 | + | |
if __name__ == '__main__':
    # Script entry point: build a reduced copy of the apt-xapian-index (axi)
    # under ~/.app-recommender/, selected either by a list of package names
    # ("sample" mode) or by a list of xapian terms ("filter" mode).
    axi_path = "/var/lib/apt-xapian-index/index"
    axi = xapian.Database(axi_path)
    base_dir = os.path.expanduser("~/.app-recommender/")

    begin_time = datetime.datetime.now()

    # axi sample based on the pkgs sample provided by command line
    if "sample" in sys.argv:
        print("Sample package indexing started at %s" % begin_time)
        if len(sys.argv) > 2:
            pkgs_filter = sys.argv[2]
        else:
            # The mode keyword tested above is "sample", so advertise that
            # exact word in the usage message.
            print("Usage: indexer sample pkgs_sample_file")
            sys.exit(1)
        with open(pkgs_filter) as valid:
            pkgs_list = [line.strip() for line in valid]
        # The new index is named after the sample file (directory dropped).
        filter_str = os.path.basename(pkgs_filter)
        index = data.SampleAptXapianIndex(pkgs_list, axi,
                                          os.path.join(base_dir,
                                                       "axi_" + filter_str))
        print("Axi size: %d" % axi.get_doccount())
        print("Packages list length: %d" % len(pkgs_list))
        print("Sample index size: %d" % index.get_doccount())

    # axi filtered by terms provided by command line
    if "filter" in sys.argv:
        print("Filtered package indexing started at %s" % begin_time)
        if len(sys.argv) > 2:
            terms = sys.argv[2:]
        else:
            # The mode keyword tested above is "filter".
            print("Usage: indexer filter term [additional terms]")
            sys.exit(1)
        # Index name is built from the part of each term after the last "::".
        terms_str = "_".join([t.split("::")[-1] for t in terms])
        index = data.FilteredXapianIndex(terms, axi,
                                         os.path.join(base_dir,
                                                      "axi_" + terms_str))
        print("Axi size: %d" % axi.get_doccount())
        print("Terms filter: %s" % terms)
        print("Filtered index size: %d" % index.get_doccount())

    end_time = datetime.datetime.now()
    print("Indexing completed at %s" % end_time)
    delta = end_time - begin_time
    # timedelta.seconds only holds the sub-day remainder; add the whole days
    # so that very long runs are not under-reported.
    print("Time elapsed: %d seconds." % (delta.days * 86400 + delta.seconds))
... | ... | @@ -0,0 +1,52 @@ |
1 | +#!/usr/bin/env python | |
2 | +""" | |
3 | + popindex.py - generate a popcon index to be used by the recommender as the | |
4 | + users repository, based on filters provided by config | |
5 | +""" | |
6 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | |
7 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | |
8 | +__license__ = """ | |
9 | + This program is free software: you can redistribute it and/or modify | |
10 | + it under the terms of the GNU General Public License as published by | |
11 | + the Free Software Foundation, either version 3 of the License, or | |
12 | + (at your option) any later version. | |
13 | + | |
14 | + This program is distributed in the hope that it will be useful, | |
15 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 | + GNU General Public License for more details. | |
18 | + | |
19 | + You should have received a copy of the GNU General Public License | |
20 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | |
21 | +""" | |
22 | +import os | |
23 | +import sys | |
24 | +sys.path.insert(0,'../') | |
25 | +import logging | |
26 | +import datetime | |
27 | + | |
28 | +from config import Config | |
29 | +from data import FilteredPopconXapianIndex | |
30 | + | |
if __name__ == '__main__':
    # Script entry point: build the popcon xapian index "popcon_XD" from the
    # submissions stored in "popcon-entries", using the "axi_XD" index and the
    # debtags filter file, all located under ~/.app-recommender/.
    base_dir = os.path.expanduser("~/.app-recommender/")
    axi_path = os.path.join(base_dir, "axi_XD")
    path = os.path.join(base_dir, "popcon_XD")
    popcon_dir = os.path.join(base_dir, "popcon-entries")
    tags_filter = os.path.join(base_dir, "filters/debtags")

    # set up config for logging
    cfg = Config()

    begin_time = datetime.datetime.now()
    logging.info("Popcon indexing started at %s", begin_time)
    # use config file or command line options
    index = FilteredPopconXapianIndex(path, popcon_dir, axi_path, tags_filter)

    end_time = datetime.datetime.now()
    logging.info("Popcon indexing completed at %s", end_time)
    logging.info("Number of documents (submissions): %d",
                 index.get_doccount())

    delta = end_time - begin_time
    # timedelta.seconds only holds the sub-day remainder; add the whole days
    # so that very long indexing runs are not under-reported.
    logging.info("Time elapsed: %d seconds.",
                 delta.days * 86400 + delta.seconds)
src/data.py
... | ... | @@ -36,6 +36,18 @@ from singleton import Singleton |
36 | 36 | from dissimilarity import * |
37 | 37 | from config import Config |
38 | 38 | |
def axi_get_pkgs(axi):
    """
    Return the list of package names found in the given index.

    'axi' must provide get_lastdocid() and get_document(docid); each document
    must provide termlist(), whose items expose the term text as '.term'.
    The package name of a document is its first term starting with "XP",
    with that two-character prefix removed.

    Document ids that are not present in the index and documents that carry
    no "XP" term are skipped.
    """
    pkgs_names = []
    for docid in range(1, axi.get_lastdocid() + 1):
        try:
            doc = axi.get_document(docid)
        except xapian.DocNotFoundError:
            # Skip the missing id instead of falling through with an unbound
            # (or stale, from the previous iteration) 'doc'.
            logging.debug("Doc %d not found in axi." % docid)
            continue
        for item in doc.termlist():
            term = item.term
            if term.startswith("XP"):
                # Slice the prefix off; lstrip('XP') would strip any run of
                # leading 'X'/'P' characters, eating into the package name.
                pkgs_names.append(term[2:])
                break
    return pkgs_names
50 | + | |
39 | 51 | def axi_search_pkgs(axi,pkgs_list): |
40 | 52 | terms = ["XP"+item for item in pkgs_list] |
41 | 53 | query = xapian.Query(xapian.Query.OP_OR, terms) |
... | ... | @@ -106,27 +118,32 @@ def tfidf_plus(index,docs,content_filter): |
106 | 118 | """ |
107 | 119 | return tfidf_weighting(index,docs,content_filter,1) |
108 | 120 | |
109 | -class AppAptXapianIndex(xapian.WritableDatabase): | |
121 | +class FilteredXapianIndex(xapian.WritableDatabase): | |
110 | 122 | """ |
111 | - Data source for application packages information | |
123 | + Filtered Xapian Index | |
112 | 124 | """ |
113 | - def __init__(self,axi_path,path): | |
def __init__(self,terms,index_path,path):
    """
    Create (or overwrite) a xapian index at 'path' holding every document
    of the source index that contains at least one of 'terms'.

    'terms' is a collection of xapian term strings; 'index_path' is handed
    to xapian.Database() to open the source index.
    """
    # NOTE(review): indexer.py in this same commit passes an already opened
    # xapian.Database as 'index_path' -- confirm xapian.Database() accepts it.
    xapian.WritableDatabase.__init__(self,path,
                                     xapian.DB_CREATE_OR_OVERWRITE)
    index = xapian.Database(index_path)
    for docid in range(1,index.get_lastdocid()+1):
        # Only the lookup is guarded, so that a failure while adding a
        # document is not misreported as "not found" and silently dropped.
        try:
            doc = index.get_document(docid)
        except xapian.DocNotFoundError:
            logging.info("Doc %d not found in axi." % docid)
            continue
        docterms = set(term.term for term in doc.termlist())
        if any(t in docterms for t in terms):
            self.add_document(doc)
            logging.info("Added doc %d." % docid)
        else:
            logging.info("Discarded doc %d." % docid)
    logging.info("Filter: %s" % terms)
    logging.info("Index size: %d" % index.get_doccount())
    logging.info("Filtered Index size: %d (lastdocid: %d)." %
                 (self.get_doccount(), self.get_lastdocid()))
131 | 148 | |
132 | 149 | def __str__(self): |
... | ... | @@ -297,6 +314,80 @@ class PopconSubmission(): |
297 | 314 | elif data[4] == '<RECENT-CTIME>': |
298 | 315 | self.packages[pkg] = 8 |
299 | 316 | |
class FilteredPopconXapianIndex(xapian.WritableDatabase):
    """
    Data source for popcon submissions defined as a xapian database.
    """
    def __init__(self,path,popcon_dir,axi_path,tags_filter):
        """
        Set initial attributes and build the index from scratch.

        path        -- directory of the new index; removed and recreated.
        popcon_dir  -- directory tree holding the popcon submission files.
        axi_path    -- xapian index used to validate packages and to look
                       up their tags.
        tags_filter -- text file with one valid tag per line; lines
                       starting with "#" are ignored.

        Each submission with at least 10 valid packages becomes one
        document: its data is the submission user id and its terms are
        "XP"+package for every package found in axi, plus the package tags
        that are listed in the tags filter, all weighted by the value the
        submission associates with the package.

        Raises Error if popcon_dir is empty or the index cannot be created.
        """
        self.axi = xapian.Database(axi_path)
        self.path = os.path.expanduser(path)
        self.popcon_dir = os.path.expanduser(popcon_dir)
        self.valid_pkgs = axi_get_pkgs(self.axi)
        logging.debug("Considering %d valid packages" % len(self.valid_pkgs))
        with open(tags_filter) as valid_tags:
            self.valid_tags = [line.strip() for line in valid_tags
                               if not line.startswith("#")]
        logging.debug("Considering %d valid tags" % len(self.valid_tags))
        if not os.path.exists(self.popcon_dir):
            os.makedirs(self.popcon_dir)
        if not os.listdir(self.popcon_dir):
            logging.critical("Popcon dir seems to be empty.")
            raise Error

        # set up directory
        shutil.rmtree(self.path, ignore_errors=True)
        os.makedirs(self.path)
        try:
            logging.info("Indexing popcon submissions from \'%s\'" %
                         self.popcon_dir)
            logging.info("Creating new xapian index at \'%s\'" %
                         self.path)
            xapian.WritableDatabase.__init__(self,self.path,
                                             xapian.DB_CREATE_OR_OVERWRITE)
        except xapian.DatabaseError as e:
            logging.critical("Could not create popcon xapian index.")
            logging.critical(str(e))
            raise Error

        # build new index
        # Membership is tested once per tag of every package of every
        # submission, so look tags up in a set rather than in the list.
        valid_tags_set = set(self.valid_tags)
        for root, dirs, files in os.walk(self.popcon_dir):
            for popcon_file in files:
                submission = PopconSubmission(os.path.join(root, popcon_file))
                submission_pkgs = submission.get_filtered(self.valid_pkgs)
                if len(submission_pkgs) < 10:
                    logging.debug("Low profile popcon submission \'%s\' (%d)" %
                                  (submission.user_id,len(submission_pkgs)))
                    continue
                doc = xapian.Document()
                doc.set_data(submission.user_id)
                logging.debug("Parsing popcon submission \'%s\'" %
                              submission.user_id)
                for pkg,freq in submission_pkgs.items():
                    tags = axi_search_pkg_tags(self.axi,pkg)
                    # if the package was not found in axi
                    if not tags:
                        continue
                    doc.add_term("XP"+pkg,freq)
                    # if the package has no tags associated with it
                    if tags == "notags":
                        continue
                    for tag in tags:
                        # Drop the "XT" prefix by slicing; lstrip("XT")
                        # would strip any run of leading 'X'/'T' characters.
                        tag_name = tag[2:] if tag.startswith("XT") else tag
                        if tag_name in valid_tags_set:
                            doc.add_term(tag,freq)
                doc_id = self.add_document(doc)
                logging.debug("Popcon Xapian: Indexing doc %d" % doc_id)
            # python garbage collector
            gc.collect()
        # flush to disk database changes
        try:
            self.commit()
        except AttributeError:
            # Old library versions have no commit(); only that case falls
            # back to the deprecated flush(). Real commit errors propagate.
            self.flush()
389 | + | |
390 | +# Deprecated class, must be reviewed | |
300 | 391 | class PopconXapianIndex(xapian.WritableDatabase): |
301 | 392 | """ |
302 | 393 | Data source for popcon submissions defined as a singleton xapian database. | ... | ... |