From faf4517619a3bf43b52305fbd3c196cadccfbf19 Mon Sep 17 00:00:00 2001
From: Tássia Camões Araújo
Date: Wed, 3 Aug 2011 03:00:20 +0000
Subject: [PATCH] New scripts for package data indexing and terms ranking by frequency.

---
 src/bin/pkgindex.py   | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/bin/rank_terms.py | 39 ++++++++++++++++++++++++++++++
 2 files changed, 110 insertions(+), 0 deletions(-)
 create mode 100755 src/bin/pkgindex.py
 create mode 100755 src/bin/rank_terms.py

diff --git a/src/bin/pkgindex.py b/src/bin/pkgindex.py
new file mode 100755
index 0000000..94b7ed8
--- /dev/null
+++ b/src/bin/pkgindex.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+"""
+    pkgindex - A python script to build a sample package index from a
+               packages list file (one entry per line).
+
+    Usage: pkgindex.py INDEX_PATH PKGS_LIST
+"""
+__author__ = "Tassia Camoes Araujo "
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program. If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import sys
+sys.path.insert(0,'../')
+import logging
+import datetime
+from datetime import timedelta
+
+from config import Config
+from error import Error
+import data
+import xapian
+
+if __name__ == '__main__':
+    cfg = Config()
+    begin_time = datetime.datetime.now()
+    if len(sys.argv) >= 3:
+        index_path, pkgs_list_path = sys.argv[1], sys.argv[2]
+        # Keep the try body minimal: only reading the list can raise IOError.
+        try:
+            with open(pkgs_list_path) as valid:
+                pkgs_list = [line.strip() for line in valid]
+        except IOError:
+            logging.critical("File %s does not seem to be a package list",
+                             pkgs_list_path)
+            raise Error
+        logging.info("Packages list length: %d", len(pkgs_list))
+
+        logging.info("Sample package indexing started at %s", begin_time)
+        # Build the index inside the try so that a failure is actually
+        # reported as an indexing error (with its traceback) before Error
+        # is raised.
+        try:
+            pkgs_index = data.SampleAptXapianIndex(pkgs_list,
+                                                   xapian.Database(cfg.axi),
+                                                   index_path)
+        except Exception:
+            logging.critical("Could not create the index at %s", index_path,
+                             exc_info=True)
+            raise Error
+
+        end_time = datetime.datetime.now()
+        print("Sample package indexing completed at %s" % end_time)
+        print("Number of documents: %d" % pkgs_index.get_doccount())
+        delta = end_time - begin_time
+        # timedelta.seconds alone drops whole days; fold them back in.
+        elapsed = delta.days * 86400 + delta.seconds
+        logging.info("Time elapsed: %d seconds.", elapsed)
+    else:
+        logging.critical("Usage: pkgindex.py INDEX_PATH PKGS_LIST")
diff --git a/src/bin/rank_terms.py b/src/bin/rank_terms.py
new file mode 100755
index 0000000..37cc42e
--- /dev/null
+++ b/src/bin/rank_terms.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python
+"""
+    rank_terms - A python script to rank terms by their frequency in a
+                 xapian index.
+
+    Usage: rank_terms.py INDEX TERMS_FILE PREFIX
+"""
+
+import xapian
+import os
+from operator import itemgetter
+import sys
+
+if __name__ == '__main__':
+    if "-h" in sys.argv or len(sys.argv) != 4:
+        print("\nUsage: rank_terms.py INDEX TERMS_FILE PREFIX\n")
+    else:
+        index_path, terms_path, prefix = sys.argv[1:4]
+        # Stop at the first failure: nothing below can work without both
+        # the index and the terms list.
+        try:
+            index = xapian.Database(index_path)
+        except xapian.Error:
+            sys.exit("Could not open xapian index at %s" % index_path)
+        try:
+            with open(terms_path) as terms_file:
+                # Blank lines carry no term to look up.
+                terms_list = [line.strip() for line in terms_file
+                              if line.strip()]
+        except IOError:
+            sys.exit("Could not extract terms list from %s" % terms_path)
+        print(terms_list)
+        # PREFIX is prepended to every term before the lookup; results are
+        # listed in ascending order of frequency.
+        frequencies = {}
+        for term in terms_list:
+            frequencies[term] = index.get_termfreq(prefix + term)
+        for term, freq in sorted(frequencies.items(), key=itemgetter(1)):
+            print("%s %s" % (term, freq))
-- 
libgit2 0.21.2