Commit faf4517619a3bf43b52305fbd3c196cadccfbf19
1 parent
cf2bfc08
Exists in
master
and in
1 other branch
New scripts for package data indexing and terms ranking by frequency.
Showing
2 changed files
with
85 additions
and
0 deletions
Show diff stats
... | ... | @@ -0,0 +1,58 @@ |
1 | +#!/usr/bin/env python | |
2 | +""" | |
3 | + pkgindex - A python script to build a sample package index from a packages list. | |
4 | +""" | |
5 | +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>" | |
6 | +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo" | |
7 | +__license__ = """ | |
8 | + This program is free software: you can redistribute it and/or modify | |
9 | + it under the terms of the GNU General Public License as published by | |
10 | + the Free Software Foundation, either version 3 of the License, or | |
11 | + (at your option) any later version. | |
12 | + | |
13 | + This program is distributed in the hope that it will be useful, | |
14 | + but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | + GNU General Public License for more details. | |
17 | + | |
18 | + You should have received a copy of the GNU General Public License | |
19 | + along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | +""" | |
21 | +import sys | |
22 | +sys.path.insert(0,'../') | |
23 | +import logging | |
24 | +import datetime | |
25 | +from datetime import timedelta | |
26 | + | |
27 | +from config import Config | |
28 | +from error import Error | |
29 | +import data | |
30 | +import xapian | |
31 | + | |
def read_package_list(path):
    """Return the package names listed in the file at *path*.

    The file is read one entry per line; surrounding whitespace is stripped
    and blank lines are skipped.

    Raises:
        IOError: if the file cannot be opened or read.
    """
    with open(path) as pkgs_file:
        stripped = (line.strip() for line in pkgs_file)
        return [pkg for pkg in stripped if pkg]


if __name__ == '__main__':
    # Usage: pkgindex.py INDEX_PATH PKGS_LIST
    #
    # Builds a data.SampleAptXapianIndex from the packages named in PKGS_LIST
    # and the xapian database at cfg.axi, passing INDEX_PATH as the third
    # constructor argument, then reports the document count and elapsed time.
    cfg = Config()
    begin_time = datetime.datetime.now()
    if len(sys.argv) >= 3:
        index_path, pkgs_path = sys.argv[1], sys.argv[2]
        try:
            pkgs_list = read_package_list(pkgs_path)
        except IOError:
            # Single-line literal: a backslash continuation inside the string
            # would embed the source indentation in the logged message.
            logging.critical("File %s does not seem to be a package list",
                             pkgs_path)
            raise Error
        logging.info("Packages list length: %d", len(pkgs_list))

        logging.info("Sample package indexing started at %s", begin_time)
        try:
            # The index construction is the step that can fail, so it is the
            # statement guarded by the "could not create" handler.
            pkgs_index = data.SampleAptXapianIndex(pkgs_list,
                                                   xapian.Database(cfg.axi),
                                                   index_path)
        except Exception:
            logging.critical("Could not create the index at %s", index_path)
            raise Error

        end_time = datetime.datetime.now()
        print("Sample package indexing completed at %s" % end_time)
        print("Number of documents: %d" % pkgs_index.get_doccount())
        delta = end_time - begin_time
        # timedelta.seconds alone discards whole days; fold them back in.
        elapsed_seconds = delta.days * 86400 + delta.seconds
        logging.info("Time elapsed: %d seconds.", elapsed_seconds)
    else:
        logging.critical("Usage: pkgindex.py INDEX_PATH PKGS_LIST")
... | ... | @@ -0,0 +1,27 @@ |
1 | +#!/usr/bin/env python | |
2 | + | |
3 | +import xapian | |
4 | +import os | |
5 | +from operator import itemgetter | |
6 | +import sys | |
7 | + | |
def read_terms(path):
    """Return the terms listed in the file at *path*, one per line.

    Surrounding whitespace is stripped and blank lines are skipped.

    Raises:
        IOError: if the file cannot be opened or read.
    """
    with open(path) as terms_file:
        stripped = (line.strip() for line in terms_file)
        return [term for term in stripped if term]


def rank_terms(termfreq, terms, prefix=""):
    """Return ``(term, frequency)`` pairs sorted by ascending frequency.

    Args:
        termfreq: callable mapping a full term string to its frequency
            (the script passes the index's ``get_termfreq`` method).
        terms: iterable of term strings; duplicates are collapsed.
        prefix: string prepended to each term before the lookup.

    Ties on frequency are ordered by term so the result is deterministic.
    """
    frequencies = dict((term, termfreq(prefix + term)) for term in terms)
    return sorted(frequencies.items(), key=itemgetter(1, 0))


if __name__ == '__main__':
    # Usage: rank_terms.py INDEX TERMS_FILE PREFIX
    if "-h" in sys.argv or len(sys.argv) != 4:
        print("\nUsage: rank_terms.py INDEX TERMS_FILE PREFIX\n")
    else:
        index_path, terms_path, prefix = sys.argv[1:4]
        try:
            index = xapian.Database(index_path)
        except Exception:
            print("Could not open xapian index at %s" % index_path)
            # Stop here: nothing below can work without an index.
            sys.exit(1)
        try:
            terms_list = read_terms(terms_path)
        except IOError:
            print("Could not extract terms list from %s" % terms_path)
            sys.exit(1)
        print(terms_list)
        for term, freq in rank_terms(index.get_termfreq, terms_list, prefix):
            print("%s %s" % (term, freq))