Commit f1b691fb888d2df5d82dc2aee04244c86b18b2ff
1 parent
ccdace0e
Exists in
master
and in
1 other branch
Updated indexer scripts and data classes.
Showing
3 changed files
with
230 additions
and
10 deletions
Show diff stats
@@ -0,0 +1,77 @@ | @@ -0,0 +1,77 @@ | ||
1 | +#!/usr/bin/env python | ||
2 | +""" | ||
3 | + indexer.py - generate xapian indexes to be used as items and users | ||
4 | + repositories | ||
5 | +""" | ||
6 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | ||
7 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | ||
8 | +__license__ = """ | ||
9 | + This program is free software: you can redistribute it and/or modify | ||
10 | + it under the terms of the GNU General Public License as published by | ||
11 | + the Free Software Foundation, either version 3 of the License, or | ||
12 | + (at your option) any later version. | ||
13 | + | ||
14 | + This program is distributed in the hope that it will be useful, | ||
15 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
17 | + GNU General Public License for more details. | ||
18 | + | ||
19 | + You should have received a copy of the GNU General Public License | ||
20 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
21 | +""" | ||
22 | + | ||
23 | +import os | ||
24 | +import sys | ||
25 | +sys.path.insert(0,'../') | ||
26 | +import datetime | ||
27 | + | ||
28 | +from config import Config | ||
29 | +from error import Error | ||
30 | +import data | ||
31 | +import xapian | ||
32 | + | ||
def elapsed_seconds(delta):
    """
    Return the whole number of seconds spanned by the timedelta `delta`.

    Unlike `delta.seconds`, whole days are included, and the expression
    does not depend on `timedelta.total_seconds()`, which old Python
    versions lack.
    """
    return delta.days * 86400 + delta.seconds


if __name__ == '__main__':
    # Usage:
    #   indexer sample pkgs_sample_file
    #   indexer filter term [additional terms]
    # Generated indexes are stored under ~/.app-recommender/ as axi_<suffix>.
    axi_path = "/var/lib/apt-xapian-index/index"
    axi = xapian.Database(axi_path)
    base_dir = os.path.expanduser("~/.app-recommender/")

    begin_time = datetime.datetime.now()

    # axi sample based on the pkgs sample provided by command line
    if "sample" in sys.argv:
        print("Sample package indexing started at %s" % begin_time)
        if len(sys.argv) > 2:
            pkgs_filter = sys.argv[2]
        else:
            # the mode keyword checked above is "sample"
            print("Usage: indexer sample pkgs_sample_file")
            sys.exit(1)
        with open(pkgs_filter) as valid:
            pkgs_list = [line.strip() for line in valid]
        # the index is named after the sample file, without its directory
        filter_str = os.path.basename(pkgs_filter)
        index = data.SampleAptXapianIndex(pkgs_list, axi,
                                          os.path.join(base_dir,
                                                       "axi_" + filter_str))
        print("Axi size: %d" % axi.get_doccount())
        print("Packages list length: %d" % len(pkgs_list))
        print("Sample index size: %d" % index.get_doccount())

    # axi filtered by terms provided by command line
    if "filter" in sys.argv:
        print("Filtered package indexing started at %s" % begin_time)
        if len(sys.argv) > 2:
            terms = sys.argv[2:]
        else:
            # the mode keyword checked above is "filter"
            print("Usage: indexer filter term [additional terms]")
            sys.exit(1)
        # the index is named after the last "::" component of each term
        terms_str = "_".join([t.split("::")[-1] for t in terms])
        index = data.FilteredXapianIndex(terms, axi,
                                         os.path.join(base_dir,
                                                      "axi_" + terms_str))
        print("Axi size: %d" % axi.get_doccount())
        print("Terms filter: %s" % terms)
        print("Filtered index size: %d" % index.get_doccount())

    end_time = datetime.datetime.now()
    print("Indexing completed at %s" % end_time)
    delta = end_time - begin_time
    print("Time elapsed: %d seconds." % elapsed_seconds(delta))
@@ -0,0 +1,52 @@ | @@ -0,0 +1,52 @@ | ||
1 | +#!/usr/bin/env python | ||
2 | +""" | ||
3 | + popindex.py - generate a popcon index to be used by the recommender as the | ||
4 | + users repository, based on filters provided by config | ||
5 | +""" | ||
6 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | ||
7 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | ||
8 | +__license__ = """ | ||
9 | + This program is free software: you can redistribute it and/or modify | ||
10 | + it under the terms of the GNU General Public License as published by | ||
11 | + the Free Software Foundation, either version 3 of the License, or | ||
12 | + (at your option) any later version. | ||
13 | + | ||
14 | + This program is distributed in the hope that it will be useful, | ||
15 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
17 | + GNU General Public License for more details. | ||
18 | + | ||
19 | + You should have received a copy of the GNU General Public License | ||
20 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
21 | +""" | ||
22 | +import os | ||
23 | +import sys | ||
24 | +sys.path.insert(0,'../') | ||
25 | +import logging | ||
26 | +import datetime | ||
27 | + | ||
28 | +from config import Config | ||
29 | +from data import FilteredPopconXapianIndex | ||
30 | + | ||
def elapsed_seconds(delta):
    """
    Return the whole number of seconds spanned by the timedelta `delta`.

    Unlike `delta.seconds`, whole days are included, so runs longer than
    24 hours are reported correctly.
    """
    return delta.days * 86400 + delta.seconds


if __name__ == '__main__':
    # All inputs and outputs live under ~/.app-recommender/.
    base_dir = os.path.expanduser("~/.app-recommender/")
    axi_path = os.path.join(base_dir, "axi_XD")
    path = os.path.join(base_dir, "popcon_XD")
    popcon_dir = os.path.join(base_dir, "popcon-entries")
    tags_filter = os.path.join(base_dir, "filters/debtags")

    # set up config for logging (the object itself is not used afterwards)
    cfg = Config()

    begin_time = datetime.datetime.now()
    logging.info("Popcon indexing started at %s", begin_time)
    # use config file or command line options
    index = FilteredPopconXapianIndex(path, popcon_dir, axi_path, tags_filter)

    end_time = datetime.datetime.now()
    logging.info("Popcon indexing completed at %s", end_time)
    logging.info("Number of documents (submissions): %d",
                 index.get_doccount())

    delta = end_time - begin_time
    logging.info("Time elapsed: %d seconds.", elapsed_seconds(delta))
src/data.py
@@ -36,6 +36,18 @@ from singleton import Singleton | @@ -36,6 +36,18 @@ from singleton import Singleton | ||
36 | from dissimilarity import * | 36 | from dissimilarity import * |
37 | from config import Config | 37 | from config import Config |
38 | 38 | ||
def axi_get_pkgs(axi):
    """
    Return the package names found in the xapian index `axi`.

    Every docid from 1 to `axi.get_lastdocid()` is visited. For each
    document, the first term starting with "XP" is taken and the "XP"
    prefix is removed to obtain the package name. Docids that cannot be
    retrieved and documents without any "XP" term are skipped.
    """
    pkgs_names = []
    for docid in range(1, axi.get_lastdocid() + 1):
        try:
            doc = axi.get_document(docid)
        except Exception:
            # docids may have gaps; skip instead of reusing a stale doc
            logging.debug("Doc %d not found in axi.", docid)
            continue
        for item in doc.termlist():
            term = item.term
            if term.startswith("XP"):
                # slice off the prefix; lstrip('XP') would also eat
                # leading 'X'/'P' characters of the name itself
                pkgs_names.append(term[2:])
                break
    return pkgs_names
50 | + | ||
39 | def axi_search_pkgs(axi,pkgs_list): | 51 | def axi_search_pkgs(axi,pkgs_list): |
40 | terms = ["XP"+item for item in pkgs_list] | 52 | terms = ["XP"+item for item in pkgs_list] |
41 | query = xapian.Query(xapian.Query.OP_OR, terms) | 53 | query = xapian.Query(xapian.Query.OP_OR, terms) |
@@ -106,27 +118,32 @@ def tfidf_plus(index,docs,content_filter): | @@ -106,27 +118,32 @@ def tfidf_plus(index,docs,content_filter): | ||
106 | """ | 118 | """ |
107 | return tfidf_weighting(index,docs,content_filter,1) | 119 | return tfidf_weighting(index,docs,content_filter,1) |
108 | 120 | ||
109 | -class AppAptXapianIndex(xapian.WritableDatabase): | 121 | +class FilteredXapianIndex(xapian.WritableDatabase): |
110 | """ | 122 | """ |
111 | - Data source for application packages information | 123 | + Filtered Xapian Index |
112 | """ | 124 | """ |
113 | - def __init__(self,axi_path,path): | 125 | + def __init__(self,terms,index_path,path): |
114 | xapian.WritableDatabase.__init__(self,path, | 126 | xapian.WritableDatabase.__init__(self,path, |
115 | xapian.DB_CREATE_OR_OVERWRITE) | 127 | xapian.DB_CREATE_OR_OVERWRITE) |
116 | - axi = xapian.Database(axi_path) | ||
117 | - logging.info("AptXapianIndex size: %d" % axi.get_doccount()) | ||
118 | - for docid in range(1,axi.get_lastdocid()+1): | 128 | + index = xapian.Database(index_path) |
129 | + for docid in range(1,index.get_lastdocid()+1): | ||
119 | try: | 130 | try: |
120 | - doc = axi.get_document(docid) | ||
121 | - allterms = [term.term for term in doc.termlist()] | ||
122 | - if "XTrole::program" in allterms: | 131 | + doc = index.get_document(docid) |
132 | + docterms = [term.term for term in doc.termlist()] | ||
133 | + tagged = False | ||
134 | + for t in terms: | ||
135 | + if t in docterms: | ||
136 | + tagged = True | ||
137 | + if tagged: | ||
123 | self.add_document(doc) | 138 | self.add_document(doc) |
124 | logging.info("Added doc %d." % docid) | 139 | logging.info("Added doc %d." % docid) |
125 | else: | 140 | else: |
126 | logging.info("Discarded doc %d." % docid) | 141 | logging.info("Discarded doc %d." % docid) |
127 | except: | 142 | except: |
128 | logging.info("Doc %d not found in axi." % docid) | 143 | logging.info("Doc %d not found in axi." % docid) |
129 | - logging.info("AppAptXapianIndex size: %d (lastdocid: %d)." % | 144 | + logging.info("Filter: %s" % terms) |
145 | + logging.info("Index size: %d" % index.get_doccount()) | ||
146 | + logging.info("Filtered Index size: %d (lastdocid: %d)." % | ||
130 | (self.get_doccount(), self.get_lastdocid())) | 147 | (self.get_doccount(), self.get_lastdocid())) |
131 | 148 | ||
132 | def __str__(self): | 149 | def __str__(self): |
@@ -297,6 +314,80 @@ class PopconSubmission(): | @@ -297,6 +314,80 @@ class PopconSubmission(): | ||
297 | elif data[4] == '<RECENT-CTIME>': | 314 | elif data[4] == '<RECENT-CTIME>': |
298 | self.packages[pkg] = 8 | 315 | self.packages[pkg] = 8 |
299 | 316 | ||
class FilteredPopconXapianIndex(xapian.WritableDatabase):
    """
    Data source for popcon submissions defined as a xapian database.

    Each popcon submission with at least 10 valid packages becomes one
    document. The document data is the submission user id; its terms are
    "XP"+package for every submitted package found in axi, plus the tags
    returned by axi_search_pkg_tags that are listed in the tags filter.
    """
    def __init__(self, path, popcon_dir, axi_path, tags_filter):
        """
        Set initial attributes and build the index from scratch.

        path        -- directory of the new index; removed and recreated
        popcon_dir  -- directory tree holding the popcon submission files
        axi_path    -- xapian index used to validate packages and tags
        tags_filter -- text file with one valid tag per line; lines
                       starting with "#" and blank lines are ignored

        Raises Error if popcon_dir has no entries or if the xapian index
        cannot be created.
        """
        self.axi = xapian.Database(axi_path)
        self.path = os.path.expanduser(path)
        self.popcon_dir = os.path.expanduser(popcon_dir)
        self.valid_pkgs = axi_get_pkgs(self.axi)
        logging.debug("Considering %d valid packages", len(self.valid_pkgs))
        with open(tags_filter) as valid_tags:
            self.valid_tags = [line.strip() for line in valid_tags
                               if line.strip() and not line.startswith("#")]
        logging.debug("Considering %d valid tags", len(self.valid_tags))
        if not os.path.exists(self.popcon_dir):
            os.makedirs(self.popcon_dir)
        if not os.listdir(self.popcon_dir):
            logging.critical("Popcon dir seems to be empty.")
            raise Error

        # set up directory
        shutil.rmtree(self.path, ignore_errors=True)
        os.makedirs(self.path)
        try:
            logging.info("Indexing popcon submissions from '%s'",
                         self.popcon_dir)
            logging.info("Creating new xapian index at '%s'", self.path)
            xapian.WritableDatabase.__init__(self, self.path,
                                             xapian.DB_CREATE_OR_OVERWRITE)
        except xapian.DatabaseError as e:
            logging.critical("Could not create popcon xapian index.")
            logging.critical(str(e))
            raise Error

        # membership is tested once per tag of every package: use a set
        valid_tags_set = set(self.valid_tags)

        # build new index
        for root, dirs, files in os.walk(self.popcon_dir):
            for popcon_file in files:
                submission = PopconSubmission(os.path.join(root, popcon_file))
                submission_pkgs = submission.get_filtered(self.valid_pkgs)
                if len(submission_pkgs) < 10:
                    logging.debug("Low profile popcon submission '%s' (%d)",
                                  submission.user_id, len(submission_pkgs))
                    continue
                doc = xapian.Document()
                doc.set_data(submission.user_id)
                logging.debug("Parsing popcon submission '%s'",
                              submission.user_id)
                for pkg, freq in submission_pkgs.items():
                    tags = axi_search_pkg_tags(self.axi, pkg)
                    # a falsy result means the package was not found in axi
                    if not tags:
                        continue
                    doc.add_term("XP" + pkg, freq)
                    # "notags" marks a package with no tags associated
                    if tags == "notags":
                        continue
                    for tag in tags:
                        # drop the "XT" prefix only; lstrip("XT") would
                        # also eat leading 'X'/'T' characters of the tag
                        name = tag[2:] if tag.startswith("XT") else tag
                        if name in valid_tags_set:
                            doc.add_term(tag, freq)
                doc_id = self.add_document(doc)
                logging.debug("Popcon Xapian: Indexing doc %d", doc_id)
            # python garbage collector
            gc.collect()
        # flush to disk database changes
        try:
            self.commit()
        except AttributeError:
            # old lib versions have no commit(); use the deprecated flush()
            self.flush()
389 | + | ||
390 | +# Deprecated class, must be reviewed | ||
300 | class PopconXapianIndex(xapian.WritableDatabase): | 391 | class PopconXapianIndex(xapian.WritableDatabase): |
301 | """ | 392 | """ |
302 | Data source for popcon submissions defined as a singleton xapian database. | 393 | Data source for popcon submissions defined as a singleton xapian database. |