Commit faf4517619a3bf43b52305fbd3c196cadccfbf19

Authored by Tássia Camões Araújo
1 parent cf2bfc08
Exists in master and in 1 other branch add_vagrant

New scripts for package data indexing and terms ranking by frequency.

Showing 2 changed files with 85 additions and 0 deletions   Show diff stats
src/bin/pkgindex.py 0 → 100755
... ... @@ -0,0 +1,58 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + pkgindex - A python script to build a sample package index with Xapian.
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +import sys
  22 +sys.path.insert(0,'../')
  23 +import logging
  24 +import datetime
  25 +from datetime import timedelta
  26 +
  27 +from config import Config
  28 +from error import Error
  29 +import data
  30 +import xapian
  31 +
def read_packages(path):
    """Return the whitespace-stripped lines of the package list at *path*."""
    with open(path) as pkgs_file:
        return [line.strip() for line in pkgs_file]


def elapsed_seconds(delta):
    """Return the whole seconds spanned by the timedelta *delta*.

    ``delta.seconds`` alone only holds the sub-day remainder, so the days
    component is added back in.
    """
    return delta.days * 86400 + delta.seconds


def main(argv):
    """Build a sample package index and report its size and build time.

    argv[1] is handed to data.SampleAptXapianIndex as the index path and
    argv[2] names a text file with one package per line.  With fewer than
    two arguments only the usage message is logged.

    Raises:
        Error: if the package list cannot be read or the index cannot be
            created.
    """
    cfg = Config()
    begin_time = datetime.datetime.now()
    if len(argv) < 3:
        logging.critical("Usage: pkgindex.py INDEX_PATH PKGS_LIST")
        return

    index_path, pkgs_path = argv[1], argv[2]
    try:
        pkgs_list = read_packages(pkgs_path)
    except (IOError, OSError, UnicodeError):
        # exc_info keeps the real cause visible; `raise Error` alone hides it.
        logging.critical("File %s does not seem to be a package list",
                         pkgs_path, exc_info=True)
        raise Error
    logging.info("Packages list length: %d", len(pkgs_list))

    # Log the start before the work begins, and keep the index construction
    # inside the try so a failure is actually reported by the handler below.
    logging.info("Sample package indexing started at %s", begin_time)
    try:
        pkgs_index = data.SampleAptXapianIndex(pkgs_list,
                                               xapian.Database(cfg.axi),
                                               index_path)
    except Exception:
        logging.critical("Could not create the index at %s", index_path,
                         exc_info=True)
        raise Error

    end_time = datetime.datetime.now()
    print("Sample package indexing completed at %s" % end_time)
    print("Number of documents: %d" % pkgs_index.get_doccount())
    logging.info("Time elapsed: %d seconds.",
                 elapsed_seconds(end_time - begin_time))


if __name__ == '__main__':
    main(sys.argv)
... ...
src/bin/rank_terms.py 0 → 100755
... ... @@ -0,0 +1,27 @@
  1 +#!/usr/bin/env python
  2 +
  3 +import xapian
  4 +import os
  5 +from operator import itemgetter
  6 +import sys
  7 +
def read_terms(path):
    """Return the non-empty, whitespace-stripped lines of the file at *path*."""
    with open(path) as terms_file:
        stripped = (line.strip() for line in terms_file)
        # A blank line would otherwise be looked up as the bare prefix.
        return [term for term in stripped if term]


def rank_terms(terms, prefix, get_termfreq):
    """Return (term, frequency) pairs sorted by ascending frequency.

    Each frequency is ``get_termfreq(prefix + term)``.  Duplicate terms are
    reported once, and ties are ordered by term so the output is
    deterministic.
    """
    frequencies = dict((term, get_termfreq(prefix + term)) for term in terms)
    return sorted(frequencies.items(), key=itemgetter(1, 0))


def main(argv):
    """Print the terms of a file ranked by their frequency in a xapian index.

    Expects argv to be [program, INDEX, TERMS_FILE, PREFIX].  Returns the
    process exit status: 0 after printing the usage text or the ranking,
    1 when the index or the terms file cannot be read.
    """
    if "-h" in argv or len(argv) != 4:
        print("\nUsage: rank_terms.py INDEX TERMS_FILE PREFIX\n")
        return 0

    index_path, terms_path, prefix = argv[1:4]
    try:
        index = xapian.Database(index_path)
    except xapian.Error:
        print("Could not open xapian index at %s" % index_path)
        # Stop here: without an index the rest cannot run.
        return 1

    try:
        terms_list = read_terms(terms_path)
    except (IOError, OSError, UnicodeError):
        print("Could not extract terms list from %s" % terms_path)
        return 1

    print(terms_list)
    for term, freq in rank_terms(terms_list, prefix, index.get_termfreq):
        print("%s %s" % (term, freq))
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
... ...