Commit f1b691fb888d2df5d82dc2aee04244c86b18b2ff

Authored by Tássia Camões Araújo
1 parent ccdace0e
Exists in master and in 1 other branch add_vagrant

Updated indexer scripts and data classes.

src/bin/indexer_axi.py 0 → 100755
... ... @@ -0,0 +1,77 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + indexer_axi.py - generate xapian indexes to be used as items and users
  4 + repositories
  5 +"""
  6 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  7 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  8 +__license__ = """
  9 + This program is free software: you can redistribute it and/or modify
  10 + it under the terms of the GNU General Public License as published by
  11 + the Free Software Foundation, either version 3 of the License, or
  12 + (at your option) any later version.
  13 +
  14 + This program is distributed in the hope that it will be useful,
  15 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17 + GNU General Public License for more details.
  18 +
  19 + You should have received a copy of the GNU General Public License
  20 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  21 +"""
  22 +
  23 +import os
  24 +import sys
  25 +sys.path.insert(0,'../')
  26 +import datetime
  27 +
  28 +from config import Config
  29 +from error import Error
  30 +import data
  31 +import xapian
  32 +
if __name__ == '__main__':
    # Entry point: build a reduced copy of the system apt-xapian-index under
    # ~/.app-recommender/. Two modes are selected from the command line:
    #   sample <file>     keep only the packages listed in <file>
    #   filter <term>...  keep only documents indexed by one of the terms
    axi_path = "/var/lib/apt-xapian-index/index"
    axi = xapian.Database(axi_path)
    base_dir = os.path.expanduser("~/.app-recommender/")

    begin_time = datetime.datetime.now()

    # axi sample based on the pkgs sample provided by command line
    if "sample" in sys.argv:
        print("Sample package indexing started at %s" % begin_time)
        if len(sys.argv) <= 2:
            print("Usage: indexer_axi.py sample pkgs_sample_file")
            sys.exit(1)
        pkgs_filter = sys.argv[2]
        with open(pkgs_filter) as valid:
            # one package name per line; blank lines are not package names
            pkgs_list = [line.strip() for line in valid if line.strip()]
        # the new index is named after the sample file
        filter_str = os.path.basename(pkgs_filter)
        index = data.SampleAptXapianIndex(pkgs_list,axi,
                                          os.path.join(base_dir,
                                                       "axi_"+filter_str))
        print("Axi size: %d" % axi.get_doccount())
        print("Packages list length: %d" % len(pkgs_list))
        print("Sample index size: %d" % index.get_doccount())

    # axi filtered by terms provided by command line
    if "filter" in sys.argv:
        print("Filtered package indexing started at %s" % begin_time)
        if len(sys.argv) <= 2:
            print("Usage: indexer_axi.py filter term [additional terms]")
            sys.exit(1)
        terms = sys.argv[2:]
        # name the new index after the last "::" component of each term
        terms_str = "_".join([t.split("::")[-1] for t in terms])
        index = data.FilteredXapianIndex(terms,axi,
                                         os.path.join(base_dir,
                                                      "axi_"+terms_str))
        print("Axi size: %d" % axi.get_doccount())
        print("Terms filter: %s" % terms)
        print("Filtered index size: %d" % index.get_doccount())

    end_time = datetime.datetime.now()
    print("Indexing completed at %s" % end_time)
    delta = end_time - begin_time
    # timedelta.seconds alone drops whole days, so add them back
    print("Time elapsed: %d seconds." % (delta.days * 86400 + delta.seconds))
... ...
src/bin/indexer_popcon.py 0 → 100755
... ... @@ -0,0 +1,52 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + indexer_popcon.py - generate a popcon index to be used by the recommender as the
  4 + users repository, based on filters provided by config
  5 +"""
  6 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  7 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  8 +__license__ = """
  9 + This program is free software: you can redistribute it and/or modify
  10 + it under the terms of the GNU General Public License as published by
  11 + the Free Software Foundation, either version 3 of the License, or
  12 + (at your option) any later version.
  13 +
  14 + This program is distributed in the hope that it will be useful,
  15 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17 + GNU General Public License for more details.
  18 +
  19 + You should have received a copy of the GNU General Public License
  20 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  21 +"""
  22 +import os
  23 +import sys
  24 +sys.path.insert(0,'../')
  25 +import logging
  26 +import datetime
  27 +
  28 +from config import Config
  29 +from data import FilteredPopconXapianIndex
  30 +
if __name__ == '__main__':
    # Entry point: build the filtered popcon index under ~/.app-recommender/
    # from the submissions in "popcon-entries", using the "axi_XD" index and
    # the "filters/debtags" tags file.
    base_dir = os.path.expanduser("~/.app-recommender/")
    axi_path = os.path.join(base_dir,"axi_XD")
    path = os.path.join(base_dir,"popcon_XD")
    popcon_dir = os.path.join(base_dir,"popcon-entries")
    tags_filter = os.path.join(base_dir,"filters/debtags")

    # set up config for logging (the instance itself is not used below)
    cfg = Config()

    begin_time = datetime.datetime.now()
    logging.info("Popcon indexing started at %s", begin_time)
    # use config file or command line options
    index = FilteredPopconXapianIndex(path,popcon_dir,axi_path,tags_filter)

    end_time = datetime.datetime.now()
    logging.info("Popcon indexing completed at %s", end_time)
    logging.info("Number of documents (submissions): %d",
                 index.get_doccount())

    delta = end_time - begin_time
    # timedelta.seconds alone drops whole days, so add them back
    logging.info("Time elapsed: %d seconds.",
                 delta.days * 86400 + delta.seconds)
... ...
src/data.py
... ... @@ -36,6 +36,18 @@ from singleton import Singleton
36 36 from dissimilarity import *
37 37 from config import Config
38 38  
def axi_get_pkgs(axi):
    """
    Return the names of the packages indexed in 'axi'.

    Every docid from 1 to axi.get_lastdocid() is visited; the package name
    of a document is its first term starting with "XP", without that prefix.
    Docids that cannot be fetched and documents without an "XP" term are
    skipped.
    """
    pkgs_names = []
    for docid in range(1,axi.get_lastdocid()+1):
        try:
            doc = axi.get_document(docid)
        except Exception:
            # Best effort, as before, but skip the docid: falling through
            # would reuse the document fetched in the previous iteration.
            continue
        for t in doc.termlist():
            if t.term.startswith("XP"):
                # Slice the prefix off; lstrip('XP') would also eat leading
                # 'X'/'P' characters that belong to the name itself.
                pkgs_names.append(t.term[2:])
                break
    return pkgs_names
  50 +
39 51 def axi_search_pkgs(axi,pkgs_list):
40 52 terms = ["XP"+item for item in pkgs_list]
41 53 query = xapian.Query(xapian.Query.OP_OR, terms)
... ... @@ -106,27 +118,32 @@ def tfidf_plus(index,docs,content_filter):
106 118 """
107 119 return tfidf_weighting(index,docs,content_filter,1)
108 120  
109   -class AppAptXapianIndex(xapian.WritableDatabase):
  121 +class FilteredXapianIndex(xapian.WritableDatabase):
110 122 """
111   - Data source for application packages information
  123 + Filtered Xapian Index
112 124 """
113   - def __init__(self,axi_path,path):
  125 + def __init__(self,terms,index_path,path):
114 126 xapian.WritableDatabase.__init__(self,path,
115 127 xapian.DB_CREATE_OR_OVERWRITE)
116   - axi = xapian.Database(axi_path)
117   - logging.info("AptXapianIndex size: %d" % axi.get_doccount())
118   - for docid in range(1,axi.get_lastdocid()+1):
  128 + index = xapian.Database(index_path)
  129 + for docid in range(1,index.get_lastdocid()+1):
119 130 try:
120   - doc = axi.get_document(docid)
121   - allterms = [term.term for term in doc.termlist()]
122   - if "XTrole::program" in allterms:
  131 + doc = index.get_document(docid)
  132 + docterms = [term.term for term in doc.termlist()]
  133 + tagged = False
  134 + for t in terms:
  135 + if t in docterms:
  136 + tagged = True
  137 + if tagged:
123 138 self.add_document(doc)
124 139 logging.info("Added doc %d." % docid)
125 140 else:
126 141 logging.info("Discarded doc %d." % docid)
127 142 except:
128 143 logging.info("Doc %d not found in axi." % docid)
129   - logging.info("AppAptXapianIndex size: %d (lastdocid: %d)." %
  144 + logging.info("Filter: %s" % terms)
  145 + logging.info("Index size: %d" % index.get_doccount())
  146 + logging.info("Filtered Index size: %d (lastdocid: %d)." %
130 147 (self.get_doccount(), self.get_lastdocid()))
131 148  
132 149 def __str__(self):
... ... @@ -297,6 +314,80 @@ class PopconSubmission():
297 314 elif data[4] == '<RECENT-CTIME>':
298 315 self.packages[pkg] = 8
299 316  
class FilteredPopconXapianIndex(xapian.WritableDatabase):
    """
    Data source for popcon submissions defined as a xapian database.

    One document is stored per accepted popcon submission. Its data is the
    submission's user id and its terms are "XP"+package for each package
    found in axi, plus those packages' tags that are listed in the tags
    filter file.
    """
    # submissions with fewer valid packages than this are not indexed
    MIN_SUBMISSION_PKGS = 10

    def __init__(self,path,popcon_dir,axi_path,tags_filter):
        """
        Create the index at 'path' from the submissions under 'popcon_dir'.

        path        -- directory of the new index; removed and recreated
        popcon_dir  -- directory tree holding the popcon submission files
        axi_path    -- xapian index providing valid packages and their tags
        tags_filter -- text file with one valid tag per line; lines starting
                       with '#' are ignored

        Raises Error if 'popcon_dir' has no entries or if the xapian index
        cannot be created.
        """
        self.axi = xapian.Database(axi_path)
        self.path = os.path.expanduser(path)
        self.popcon_dir = os.path.expanduser(popcon_dir)
        self.valid_pkgs = axi_get_pkgs(self.axi)
        logging.debug("Considering %d valid packages" % len(self.valid_pkgs))
        with open(tags_filter) as valid_tags:
            stripped = [line.strip() for line in valid_tags]
        # blank lines would otherwise register the empty string as a tag
        self.valid_tags = [tag for tag in stripped
                           if tag and not tag.startswith("#")]
        logging.debug("Considering %d valid tags" % len(self.valid_tags))
        if not os.path.exists(self.popcon_dir):
            os.makedirs(self.popcon_dir)
        if not os.listdir(self.popcon_dir):
            logging.critical("Popcon dir seems to be empty.")
            raise Error

        # set up directory
        shutil.rmtree(self.path,ignore_errors=True)
        os.makedirs(self.path)
        try:
            logging.info("Indexing popcon submissions from \'%s\'" %
                         self.popcon_dir)
            logging.info("Creating new xapian index at \'%s\'" %
                         self.path)
            xapian.WritableDatabase.__init__(self,self.path,
                                             xapian.DB_CREATE_OR_OVERWRITE)
        except xapian.DatabaseError as e:
            logging.critical("Could not create popcon xapian index.")
            logging.critical(str(e))
            raise Error

        # build new index
        # set gives constant-time lookups; self.valid_tags stays a list
        valid_tags = set(self.valid_tags)
        for root, dirs, files in os.walk(self.popcon_dir):
            for popcon_file in files:
                submission = PopconSubmission(os.path.join(root, popcon_file))
                submission_pkgs = submission.get_filtered(self.valid_pkgs)
                if len(submission_pkgs) < self.MIN_SUBMISSION_PKGS:
                    logging.debug("Low profile popcon submission \'%s\' (%d)" %
                                  (submission.user_id,len(submission_pkgs)))
                    continue
                logging.debug("Parsing popcon submission \'%s\'" %
                              submission.user_id)
                doc = self._submission_doc(submission,submission_pkgs,
                                           valid_tags)
                doc_id = self.add_document(doc)
                logging.debug("Popcon Xapian: Indexing doc %d" % doc_id)
            # python garbage collector, run once per visited directory
            gc.collect()
        # flush to disk database changes
        try:
            self.commit()
        except AttributeError:
            # old lib versions have no commit(); flush() is its deprecated
            # predecessor. Real commit errors are left to propagate.
            self.flush()

    def _submission_doc(self,submission,submission_pkgs,valid_tags):
        """
        Build the xapian document for one popcon submission.

        'submission_pkgs' maps package names to the value used as term
        frequency; 'valid_tags' is the set of tag names (no "XT" prefix)
        allowed in the index.
        """
        doc = xapian.Document()
        doc.set_data(submission.user_id)
        for pkg,freq in submission_pkgs.items():
            tags = axi_search_pkg_tags(self.axi,pkg)
            # a false value means the package was not found in axi
            if not tags:
                continue
            doc.add_term("XP"+pkg,freq)
            # "notags" marks a package with no tags associated with it
            if tags == "notags":
                continue
            for tag in tags:
                # Slice the prefix off; lstrip("XT") would also eat leading
                # 'X'/'T' characters that belong to the tag name.
                tag_name = tag[2:] if tag.startswith("XT") else tag
                if tag_name in valid_tags:
                    doc.add_term(tag,freq)
        return doc
  389 +
  390 +# Deprecated class, must be reviewed
300 391 class PopconXapianIndex(xapian.WritableDatabase):
301 392 """
302 393 Data source for popcon submissions defined as a singleton xapian database.
... ...