Merge branch 'master' of github.com:tassia/AppRecommender

Tássia Camões Araújo
2 parents b9ecf615 c673b9b2
Showing 28 changed files with 1920 additions and 62 deletions Show diff stats
src/bin/cross_validation.py
src/bin/get_axipkgs.py
src/bin/get_desktop.sh
src/bin/get_pkgs_inst.py
src/bin/indexer_axi.py
src/bin/indexer_popcon.py
src/config.py
src/data.py
src/evaluation.py
src/experiments/deprecated/README
src/experiments/deprecated/clustering-suite.py
src/experiments/deprecated/experiments.cfg
src/experiments/deprecated/runner.py
src/experiments/extract-sample-db.py
src/experiments/hybrid.py
src/experiments/k-suite.py
src/experiments/legacy/clustering-suite.py
src/experiments/legacy/experiments.cfg
src/experiments/legacy/runner.py
src/experiments/popcon-population.py
@@ -37,7 +37,7 @@ if __name__ == '__main__':
     #user = LocalSystem()
     #user = RandomPopcon(cfg.popcon_dir)
     #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
-    user = PopconSystem("/home/tassia/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
+    user = PopconSystem(os.path.expanduser("~/.app-recommender/popcon-entries/00/0001166d0737c6dffb083071e5ee69f5"))
     user.filter_pkg_profile(os.path.join(cfg.filters_dir,"desktopapps"))
     user.maximal_pkg_profile()
     begin_time = datetime.datetime.now()
@@ -48,7 +48,7 @@ if __name__ == '__main__':
     metrics.append(F_score(0.5))
     metrics.append(Accuracy())
     metrics.append(FPR())
-    validation = CrossValidation(0.9,10,rec,metrics,1)
+    validation = CrossValidation(0.9,20,rec,metrics,0.005)
     validation.run(user)
     print validation
  
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+"""
+    AppRecommender - A GNU/Linux application recommender
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import os
+import sys
+sys.path.insert(0,'../')
+import xapian
+
+if __name__ == '__main__':
+    # Print the name of every package found in the given xapian index, one
+    # per line. Package names are stored as terms prefixed with "XP".
+    if len(sys.argv)<2:
+        print "Usage: get_axipkgs index_path"
+        exit(1)
+
+    axi = xapian.Database(sys.argv[1])
+    # docids start at 1 and get_lastdocid() is itself a valid id, so the
+    # upper bound has to be inclusive
+    for docid in range(1,axi.get_lastdocid()+1):
+        try:
+            doc = axi.get_document(docid)
+        except xapian.DocNotFoundError:
+            # this docid is not in use; nothing to print for it
+            continue
+        for t in doc.termlist():
+            if t.term.startswith("XP"):
+                # slice the prefix off: lstrip('XP') strips a character set
+                # and would also eat a leading 'X' or 'P' of the name itself
+                print t.term[len("XP"):]
+                break
 #!/usr/bin/env bash
 #
-# get_desktop.sh - get packages which have desktop files 
+# get_desktop.sh - get packages which have desktop files
+#
+# DEPRECATED: use get_axipkgs.py to get this info from axi
  
 cd /usr/share/app-install/desktop
 sed -ne 's/X-AppInstall-Package=//p' * | sort -u | grep -v kdelibs | grep -v libfm-gtk0
 #!/usr/bin/env python
 #
 # get_pkgs_inst.py - get tuple (package,installation) from popcon results file
+#
+# results_file: org/popcon.debian.org/popcon-mail/results
  
+import sys
 from operator import itemgetter
+
 if __name__ == '__main__':
+    if len(sys.argv)<2:
+        print "Usage: get_pkgs_inst popcon_results_path"
+        exit(1)
+
+    results_path = sys.argv[1]
     pkgs_inst = {}
-    with open("/root/org/popcon.debian.org/popcon-mail/results") as results:
+    with open(results_path) as results:
         for line in results:
             if line.startswith("Package"):
                 fields = line.split()
                 inst = int(fields[2])+int(fields[3])+int(fields[4])
-                if inst > 20:
-                    pkgs_inst[fields[1]] = inst
+                pkgs_inst[fields[1]] = inst
     sorted_by_inst = sorted(pkgs_inst.items(), key=itemgetter(1))
     for pkg, inst in sorted_by_inst:
         print pkg, inst
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+"""
+    indexer.py - generate xapian indexes to be used as items and users
+                 repositories
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import os
+import sys
+sys.path.insert(0,'../')
+import datetime
+
+from config import Config
+from error import Error
+import data
+import xapian
+
+if __name__ == '__main__':
+    # Build a reduced copy of the apt-xapian-index under ~/.app-recommender/.
+    # The mode is selected by a keyword on the command line:
+    #   sample pkgs_sample_file   - sample index built from the package
+    #                               names listed in the file (one per line)
+    #   filter term [more terms]  - index restricted to documents indexed
+    #                               by at least one of the given terms
+    axi_path = "/var/lib/apt-xapian-index/index"
+    axi = xapian.Database(axi_path)
+    base_dir = os.path.expanduser("~/.app-recommender/")
+
+    begin_time = datetime.datetime.now()
+
+    # axi sample based on the pkgs sample provided by command line
+    if "sample" in sys.argv:
+        # check the arguments before announcing that indexing has started
+        if len(sys.argv) < 3:
+            print "Usage: indexer_axi sample pkgs_sample_file"
+            exit(1)
+        pkgs_filter = sys.argv[2]
+        print ("Sample package indexing started at %s" % begin_time)
+        with open(pkgs_filter) as valid:
+            pkgs_list = [line.strip() for line in valid]
+        # the new index is named after the sample file
+        filter_str = os.path.basename(pkgs_filter)
+        index = data.SampleAptXapianIndex(pkgs_list,axi,
+                                          os.path.join(base_dir,"axi_"+filter_str))
+        print ("Axi size: %d" % axi.get_doccount())
+        print ("Packages list length: %d" % len(pkgs_list))
+        print ("Sample index size: %d" %
+                     index.get_doccount())
+
+    # axi filtered by terms provided by command line
+    if "filter" in sys.argv:
+        # check the arguments before announcing that indexing has started
+        if len(sys.argv) < 3:
+            print ("Usage: indexer_axi filter term [additional terms]")
+            exit(1)
+        terms = sys.argv[2:]
+        print ("Filtered package indexing started at %s" % begin_time)
+        # the new index is named after the last "::" component of each term
+        terms_str = "_".join([t.split("::")[-1] for t in terms])
+        index = data.FilteredXapianIndex(terms,axi,
+                                         os.path.join(base_dir,"axi_"+terms_str))
+        print ("Axi size: %d" % axi.get_doccount())
+        print ("Terms filter: %s" % terms)
+        print ("Filtered index size: %d" %
+                     index.get_doccount())
+
+    end_time = datetime.datetime.now()
+    print ("Indexing completed at %s" % end_time)
+    delta = end_time - begin_time
+    # timedelta.seconds alone excludes whole days; add them back
+    print ("Time elapsed: %d seconds." % (delta.days*86400 + delta.seconds))
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+"""
+    popindex.py - generate a popcon index to be used by the recommender as the
+                  users repository, based on filters provided by config
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+import os
+import sys
+sys.path.insert(0,'../')
+import logging
+import datetime
+
+from config import Config
+from data import FilteredPopconXapianIndex
+
+if __name__ == '__main__':
+    # Index the popcon submissions stored under ~/.app-recommender/ into a
+    # xapian database and log how long the indexing took.
+    home = os.path.expanduser("~/.app-recommender/")
+    filtered_axi_path = os.path.join(home,"axi_XD")
+    popcon_index_path = os.path.join(home,"popcon_XD")
+    submissions_dir = os.path.join(home,"popcon-entries")
+    debtags_file = os.path.join(home,"filters/debtags")
+
+    # set up config for logging
+    cfg = Config()
+
+    started = datetime.datetime.now()
+    logging.info("Popcon indexing started at %s" % started)
+    # all the paths are fixed above
+    index = FilteredPopconXapianIndex(popcon_index_path,submissions_dir,
+                                      filtered_axi_path,debtags_file)
+    finished = datetime.datetime.now()
+
+    logging.info("Popcon indexing completed at %s" % finished)
+    logging.info("Number of documents (submissions): %d" %
+                 index.get_doccount())
+
+    elapsed = finished - started
+    logging.info("Time elapsed: %d seconds." % elapsed.seconds)
@@ -40,7 +40,7 @@ class Config(Singleton):
             ## general options
             self.debug = 0
             self.verbose = 1
-            self.output = "log"
+            self.output = "apprec.log"
  
             ## data_source options
             self.base_dir = os.path.expanduser("~/.app-recommender/")
@@ -103,13 +103,14 @@ class Config(Singleton):
         print "  -f, --filtersdir=PATH      Path to filters directory"
         print "  -b, --pkgsfilter=FILTER    File containing packages to be considered for recommendations"
         print "  -a, --axi=PATH             Path to apt-xapian-index"
-        print "  -e, --dde=URL              DDE url"
         print "  -p, --popconindex=PATH     Path to popcon index"
-        print "  -m, --popcondir=PATH       Path to popcon submissions dir"
-        print "  -u, --indexmode=MODE       'old'|'reindex'|'cluster'|'recluster'"
-        print "  -l, --clustersdir=PATH     Path to popcon clusters dir"
-        print "  -c, --medoids=k            Number of medoids for clustering"
-        print "  -x, --maxpopcon=k          Number of submissions to be considered"
+        print "  -e, --dde=URL              DDE url"
+        # deprecated options
+        #print "  -m, --popcondir=PATH       Path to popcon submissions dir"
+        #print "  -u, --indexmode=MODE       'old'|'reindex'|'cluster'|'recluster'"
+        #print "  -l, --clustersdir=PATH     Path to popcon clusters dir"
+        #print "  -c, --medoids=k            Number of medoids for clustering"
+        #print "  -x, --maxpopcon=k          Number of submissions to be considered"
         print ""
         print " [ recommender ]"
         print "  -w, --weight=OPTION        Search weighting scheme"
@@ -123,11 +124,19 @@ class Config(Singleton):
         print "  bm25 = bm25 weighting scheme"
         print ""
         print " [ strategy options ] "
-        print "  cb = content-based "
-        print "  cbt = content-based using only tags as content "
-        print "  cbd = content-based using only package descriptions as content "
-        print "  col = collaborative "
-        print "  colct = collaborative through tags content "
+        print "  cb = content-based, mixed profile"
+        print "  cbt = content-based, tags only profile"
+        print "  cbd = content-based, description terms only profile"
+        print "  cbh = content-based, half-half profile"
+        print "  cb_eset = cb with eset profiling"
+        print "  cbt_eset = cbt with eset profiling"
+        print "  cbd_eset = cbd_eset with eset profiling"
+        print "  cbh_eset = cbh with eset profiling"
+        print "  knn = collaborative, tf-idf knn"
+        print "  knn_plus = collaborative, tf-idf weighted knn"
+        print "  knn_eset = collaborative, eset knn"
+        print "  knnco = collaborative through content"
+        print "  knnco_eset = collaborative through content, eset recommendation"
  
     def read_option(self, section, option):
         """
@@ -30,12 +30,26 @@ import shutil
 import apt
 import re
 import operator
+import urllib
+import simplejson as json
  
 from error import Error
 from singleton import Singleton
 from dissimilarity import *
 from config import Config
  
+def axi_get_pkgs(axi):
+    """
+    Return the names of all packages indexed in 'axi'.
+
+    'axi' is an open xapian database whose documents carry the package name
+    as a term prefixed with "XP". Docids that cannot be fetched and documents
+    without any "XP" term are skipped.
+    """
+    pkgs_names = []
+    for docid in range(1,axi.get_lastdocid()+1):
+        try:
+            doc = axi.get_document(docid)
+        except xapian.DocNotFoundError:
+            # skip unused docids; falling through would reuse the document of
+            # the previous iteration (or an unbound name on the first one)
+            continue
+        for t in doc.termlist():
+            if t.term.startswith("XP"):
+                # slice the prefix off: lstrip('XP') strips a character set
+                # and would also eat a leading 'X' or 'P' of the name itself
+                pkgs_names.append(t.term[len("XP"):])
+                break
+    return pkgs_names
+
 def axi_search_pkgs(axi,pkgs_list):
     terms = ["XP"+item for item in pkgs_list]
     query = xapian.Query(xapian.Query.OP_OR, terms)
@@ -110,30 +124,39 @@ def tfidf_plus(index,docs,content_filter):
     variance = sum([(p-mean)*(p-mean) for p in population])/len(population)
     standard_deviation = math.sqrt(variance)
     for d in docs:
-        normalized_weigths[d.docid] = d.weight/standard_deviation
+        if standard_deviation>1:
+            # values between [0-1] would cause the opposite effect
+            normalized_weigths[d.docid] = d.weight/standard_deviation
+        else:
+            normalized_weigths[d.docid] = d.weight
     return tfidf_weighting(index,docs,content_filter,normalized_weigths)
  
-class AppAptXapianIndex(xapian.WritableDatabase):
+class FilteredXapianIndex(xapian.WritableDatabase):
     """
-    Data source for application packages information
+    Filtered Xapian Index
     """
-    def __init__(self,axi_path,path):
+    def __init__(self,terms,index_path,path):
         xapian.WritableDatabase.__init__(self,path,
                                          xapian.DB_CREATE_OR_OVERWRITE)
-        axi = xapian.Database(axi_path)
-        logging.info("AptXapianIndex size: %d" % axi.get_doccount())
-        for docid in range(1,axi.get_lastdocid()+1):
+        index = xapian.Database(index_path)
+        for docid in range(1,index.get_lastdocid()+1):
             try:
-                doc = axi.get_document(docid)
-                allterms = [term.term for term in doc.termlist()]
-                if "XTrole::program" in allterms:
+                doc = index.get_document(docid)
+                docterms = [term.term for term in doc.termlist()]
+                tagged = False
+                for t in terms:
+                    if t in docterms:
+                        tagged = True
+                if tagged:
                     self.add_document(doc)
                     logging.info("Added doc %d." % docid)
                 else:
                     logging.info("Discarded doc %d." % docid)
             except:
                 logging.info("Doc %d not found in axi." % docid)
-        logging.info("AppAptXapianIndex size: %d (lastdocid: %d)." %
+        logging.info("Filter: %s" % terms)
+        logging.info("Index size: %d" % index.get_doccount())
+        logging.info("Filtered Index size: %d (lastdocid: %d)." %
                      (self.get_doccount(), self.get_lastdocid()))
  
     def __str__(self):
@@ -186,13 +209,13 @@ class DebianPackage():
         if pkg_version.record.has_key('Conflicts'):
             self.conflicts = pkg_version.record['Conflicts']
         if pkg_version.record.has_key('Replaces'):
-            self.conflicts = pkg_version.record['Replaces']
+            self.replaces = pkg_version.record['Replaces']
         if pkg_version.record.has_key('Provides'):
             self.provides = pkg_version.record['Provides']
  
     def load_details_from_dde(self,dde_server,dde_port):
-        json_data = json.load(urllib.urlopen("http://%s:%s/q/udd/packages/all/%s?t=json"
-                                             % dde_server,dde_port,self.name))
+        json_data = json.load(urllib.urlopen("http://%s:%d/q/udd/packages/prio-debian-sid/%s?t=json"
+                                             % (dde_server,dde_port,self.name)))
  
         self.maintainer = json_data['r']['maintainer']
         self.version = json_data['r']['version']
@@ -200,27 +223,27 @@ class DebianPackage():
         self.description = self.format_description(json_data['r']['long_description'])
         self.section = json_data['r']['section']
         if json_data['r']['homepage']:
-            self.conflicts = json_data['r']['homepage']
+            self.homepage = json_data['r']['homepage']
         if json_data['r']['tag']:
             self.tags = self.debtags_list_to_dict(json_data['r']['tag'])
         if json_data['r']['depends']:
             self.depends = json_data['r']['depends']
         if json_data['r']['pre_depends']:
-            self.conflicts = json_data['r']['pre_depends']
+            self.predepends = json_data['r']['pre_depends']
         if json_data['r']['recommends']:
-            self.conflicts = json_data['r']['recommends']
+            self.recommends = json_data['r']['recommends']
         if json_data['r']['suggests']:
-            self.conflicts = json_data['r']['suggests']
+            self.suggests = json_data['r']['suggests']
         if json_data['r']['conflicts']:
             self.conflicts = json_data['r']['conflicts']
         if json_data['r']['replaces']:
-            self.conflicts = json_data['r']['replaces']
+            self.replaces = json_data['r']['replaces']
         if json_data['r']['provides']:
-            self.conflicts = json_data['r']['provides']
+            self.provides = json_data['r']['provides']
         self.popcon_insts = json_data['r']['popcon']['insts']
  
     def format_description(self,description):
-        return description.replace('.\n','').replace('\n','<br />')
+        return description.replace(' .\n','<br />').replace('\n','<br />')
  
     def debtags_str_to_dict(self, debtags_str):
         debtags_list = [tag.rstrip(",") for tag in debtags_str.split()]
@@ -281,6 +304,7 @@ class PopconSubmission():
     	    for line in submission:
                 if line.startswith("POPULARITY"):
                     self.user_id = line.split()[2].lstrip("ID:")
+                    self.arch = line.split()[3].lstrip("ARCH:")
                 elif not line.startswith("END-POPULARITY"):
                     data = line.rstrip('\n').split()
                     if len(data) > 2:
@@ -304,6 +328,82 @@ class PopconSubmission():
                             elif data[4] == '<RECENT-CTIME>':
                                 self.packages[pkg] = 8
  
+class FilteredPopconXapianIndex(xapian.WritableDatabase):
+    """
+    Data source for popcon submissions defined as a xapian database.
+
+    Every submission with at least 10 valid packages becomes one document,
+    indexed by its id ("ID" term), its architecture ("ARCH" term), its
+    packages ("XP" terms) and the valid tags of those packages.
+    """
+    def __init__(self,path,popcon_dir,axi_path,tags_filter):
+        """
+        Build the index from scratch.
+
+        path        -- directory of the new index; previous content is removed
+        popcon_dir  -- directory tree containing the popcon submission files
+        axi_path    -- xapian index providing the valid packages and their
+                       tags
+        tags_filter -- file listing the valid tags, one per line; lines
+                       starting with "#" are ignored
+
+        Raises Error if popcon_dir is empty or the index cannot be created.
+        """
+        self.axi = xapian.Database(axi_path)
+        self.path = os.path.expanduser(path)
+        self.popcon_dir = os.path.expanduser(popcon_dir)
+        self.valid_pkgs = axi_get_pkgs(self.axi)
+        logging.debug("Considering %d valid packages" % len(self.valid_pkgs))
+        with open(tags_filter) as valid_tags:
+            self.valid_tags = [line.strip() for line in valid_tags
+                               if not line.startswith("#")]
+        logging.debug("Considering %d valid tags" % len(self.valid_tags))
+        if not os.path.exists(self.popcon_dir):
+            os.makedirs(self.popcon_dir)
+        if not os.listdir(self.popcon_dir):
+            logging.critical("Popcon dir seems to be empty.")
+            raise Error
+
+        # set up directory
+        shutil.rmtree(self.path,ignore_errors=True)
+        os.makedirs(self.path)
+        try:
+            logging.info("Indexing popcon submissions from \'%s\'" %
+                         self.popcon_dir)
+            logging.info("Creating new xapian index at \'%s\'" %
+                         self.path)
+            xapian.WritableDatabase.__init__(self,self.path,
+                                             xapian.DB_CREATE_OR_OVERWRITE)
+        except xapian.DatabaseError as e:
+            logging.critical("Could not create popcon xapian index.")
+            logging.critical(str(e))
+            raise Error
+
+        # build new index
+        # a set gives constant-time membership tests for every tag checked
+        valid_tags = set(self.valid_tags)
+        for root, dirs, files in os.walk(self.popcon_dir):
+            for popcon_file in files:
+                submission = PopconSubmission(os.path.join(root, popcon_file))
+                submission_pkgs = submission.get_filtered(self.valid_pkgs)
+                if len(submission_pkgs) < 10:
+                    logging.debug("Low profile popcon submission \'%s\' (%d)" %
+                                  (submission.user_id,len(submission_pkgs)))
+                else:
+                    self._index_submission(submission,submission_pkgs,
+                                           valid_tags)
+                # python garbage collector, run after each submission file
+                gc.collect()
+        # flush to disk database changes
+        try:
+            self.commit()
+        except AttributeError:
+            # deprecated function, used for compatibility with old lib version
+            self.flush()
+
+    def _index_submission(self,submission,submission_pkgs,valid_tags):
+        """
+        Add one document describing 'submission' to the index.
+
+        'submission_pkgs' maps the valid packages of the submission to the
+        frequency used when adding their terms; 'valid_tags' is the set of
+        tags allowed into the index.
+        """
+        doc = xapian.Document()
+        doc.set_data(submission.user_id)
+        doc.add_term("ID"+submission.user_id)
+        doc.add_term("ARCH"+submission.arch)
+        logging.debug("Parsing popcon submission \'%s\'" %
+                      submission.user_id)
+        for pkg,freq in submission_pkgs.items():
+            tags = axi_search_pkg_tags(self.axi,pkg)
+            # if the package was not found in axi
+            if not tags:
+                continue
+            doc.add_term("XP"+pkg,freq)
+            # if the package has tags associated with it
+            if not tags == "notags":
+                for tag in tags:
+                    if tag.lstrip("XT") in valid_tags:
+                        doc.add_term(tag,freq)
+        doc_id = self.add_document(doc)
+        logging.debug("Popcon Xapian: Indexing doc %d" % doc_id)
+
+# Deprecated class, must be reviewed
 class PopconXapianIndex(xapian.WritableDatabase):
     """
     Data source for popcon submissions defined as a singleton xapian database.
@@ -140,6 +140,29 @@ class FPR(Metric):
         return (float(len(evaluation.false_positive))/
                 evaluation.real_negative_len)
  
+class MCC(Metric):
+    """
+    Matthews correlation coefficient metric.
+    """
+    def __init__(self):
+        """
+        Set the label that describes this metric.
+        """
+        self.desc = "    MCC    "
+
+    def run(self,evaluation):
+        """
+        Return the coefficient computed from the confusion counts of
+        'evaluation', or 0 when any of the four marginal sums is zero.
+        """
+        tp = len(evaluation.true_positive)
+        fp = len(evaluation.false_positive)
+        fn = len(evaluation.false_negative)
+        tn = evaluation.true_negative_len
+        # marginal sums, in the order they are multiplied in the denominator
+        marginals = [tp+fp, tp+fn, tn+fp, tn+fn]
+        if 0 in marginals:
+            return 0
+        numerator = (tp*tn)-(fp*fn)
+        denominator = math.sqrt(marginals[0]*marginals[1]*
+                                marginals[2]*marginals[3])
+        return numerator/denominator
+
 class F_score(Metric):
     """
     Classification accuracy metric which correlates precision and recall into an
@@ -0,0 +1,2 @@
+Experiments handled by expsuite:
+https://github.com/rueckstiess/expsuite
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+"""
+    recommender suite - recommender experiments suite 
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import sys
+import os
+sys.path.insert(0,'../')
+from config import Config
+from data import PopconXapianIndex, PopconSubmission
+from recommender import Recommender
+from user import LocalSystem, User
+from evaluation import *
+import logging
+import random
+import Gnuplot
+
+if __name__ == '__main__':
+    # Run PopconXapianIndex in "recluster" mode with the configured number of
+    # medoids and popcon submissions, and log the resulting dispersion.
+
+    cfg = Config()
+    cfg.index_mode = "recluster"
+    logging.info("Starting clustering experiments")
+    logging.info("Medoids: %d\t Max popcon:%d" % (cfg.k_medoids,cfg.max_popcon))
+    cfg.popcon_dir = os.path.expanduser("~/org/popcon.debian.org/popcon-mail/popcon-entries/")
+    # index and clusters dirs are suffixed with the experiment setup
+    suffix = "_%dmedoids%dmax" % (cfg.k_medoids,cfg.max_popcon)
+    cfg.popcon_index = cfg.popcon_index+suffix
+    cfg.clusters_dir = cfg.clusters_dir+suffix
+    pxi = PopconXapianIndex(cfg)
+    logging.info("Overall dispersion: %f\n" % pxi.cluster_dispersion)
+    # Write clustering log
+    log_path = "results/clustering/%dmedoids%dmax" % (cfg.k_medoids,cfg.max_popcon)
+    with open(log_path,'w') as output:
+        output.write("# k_medoids\tmax_popcon\tdispersion\n")
+        # one conversion per header column; a format with fewer conversions
+        # than values raises TypeError
+        output.write("%d %d %f\n" %
+                     (cfg.k_medoids,cfg.max_popcon,pxi.cluster_dispersion))
@@ -0,0 +1,27 @@
+[DEFAULT]
+repetitions = 1
+iterations = 10
+path = 'results'
+experiment = 'grid'
+weight = ['bm25', 'trad']
+;profile_size = range(10,100,10)
+;sample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
+sample = [0.6, 0.7, 0.8, 0.9]
+
+[content]
+strategy = ['cb','cbt','cbd']
+
+[clustering]
+experiment = 'single'
+;iterations = 4
+;medoids = range(2,6)
+iterations = 6
+medoids = [100,500,1000,5000,10000,50000]
+;disabled for this experiment
+weight = 0
+profile_size = 0
+sample = 0
+
+[colaborative]
+users_repository=["data/popcon","data/popcon-100","data/popcon-500","data/popcon-1000","data/popcon-5000","data/popcon-10000","data/popcon-50000"]
+neighbors = range(10,1010,50)
@@ -0,0 +1,171 @@
+#!/usr/bin/env python
+"""
+    recommender suite - recommender experiments suite 
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import expsuite
+import sys
+sys.path.insert(0,'../')
+from config import Config
+from data import PopconXapianIndex, PopconSubmission
+from recommender import Recommender
+from user import LocalSystem, User
+from evaluation import *
+import logging
+import random
+import Gnuplot
+
+class ClusteringSuite(expsuite.PyExperimentSuite):
+    """
+    Experiment suite that records the cluster dispersion obtained for each
+    value of the 'medoids' parameter.
+    """
+    def reset(self, params, rep):
+        """
+        Set up the configuration used by iterate().
+        """
+        self.cfg = Config()
+        test_data = "../tests/test_data/"
+        self.cfg.popcon_index = test_data+".sample_pxi"
+        self.cfg.popcon_dir = test_data+"popcon_dir"
+        self.cfg.clusters_dir = test_data+"clusters_dir"
+
+        if params['name'] != "clustering":
+            return
+        logging.info("Starting 'clustering' experiments suite...")
+        self.cfg.index_mode = "recluster"
+
+    def iterate(self, params, rep, n):
+        """
+        Build the popcon index with the n-th number of medoids and return
+        that number together with the resulting dispersion. Experiments
+        other than "clustering" yield an empty result.
+        """
+        if params['name'] != "clustering":
+            return {}
+        k_medoids = params['medoids'][n]
+        logging.info("Running iteration %d" % k_medoids)
+        self.cfg.k_medoids = k_medoids
+        pxi = PopconXapianIndex(self.cfg)
+        return {'k_medoids': k_medoids,
+                'dispersion': pxi.cluster_dispersion}
+
+class ContentBasedSuite(expsuite.PyExperimentSuite):
+    """
+    Experiment suite for experiments whose name starts with "content": part
+    of the local user's package profile is held out, a recommendation is
+    computed from the rest, and the ranks of the held-out packages are
+    logged and plotted.
+    """
+    def reset(self, params, rep):
+        """
+        Create the recommender and the user profile used by iterate().
+        Does nothing for experiments whose name does not start with
+        "content".
+        """
+        if params['name'].startswith("content"):
+            cfg = Config()
+            #if the index was not built yet
+            #app_axi = AppAptXapianIndex(cfg.axi,"results/arnaldo/AppAxi")
+            cfg.axi = "data/AppAxi"
+            cfg.index_mode = "old"
+            cfg.weight = params['weight']
+            self.rec = Recommender(cfg)
+            self.rec.set_strategy(params['strategy'])
+            # number of documents in the items repository; iterate() asks
+            # for a recommendation of this size
+            self.repo_size = self.rec.items_repository.get_doccount()
+            self.user = LocalSystem()
+            self.user.app_pkg_profile(self.rec.items_repository)
+            self.user.no_auto_pkg_profile()
+            # params['sample'] is the fraction of the profile to hold out
+            self.sample_size = int(len(self.user.pkg_profile)*params['sample'])
+            # iteration should be set to 10 in config file
+            #self.profile_size = range(10,101,10)
+
+    def iterate(self, params, rep, n):
+        """
+        Run one hold-out round and return its summary as a dict.
+
+        Writes the recall log to results/content/recall/ and a plot of the
+        metrics next to it. Returns None (implicitly) for experiments whose
+        name does not start with "content".
+        """
+        if params['name'].startswith("content"):
+            item_score = dict.fromkeys(self.user.pkg_profile,1)
+            # Prepare partition
+            # randomly chosen packages are moved out of item_score into
+            # sample, so the two dicts end up disjoint
+            sample = {}
+            for i in range(self.sample_size):
+                 key = random.choice(item_score.keys())
+                 sample[key] = item_score.pop(key)
+            # Get full recommendation
+            user = User(item_score)
+            recommendation = self.rec.get_recommendation(user,self.repo_size)
+            # Write recall log
+            recall_file = "results/content/recall/%s-%s-%.2f-%d" % \
+                          (params['strategy'],params['weight'],params['sample'],n)
+            output = open(recall_file,'w')
+            output.write("# weight=%s\n" % params['weight'])
+            output.write("# strategy=%s\n" % params['strategy'])
+            output.write("# sample=%f\n" % params['sample'])
+            output.write("\n%d %d %d\n" % \
+                         (self.repo_size,len(item_score),self.sample_size))
+            # position in the ranking of each held-out package that was
+            # recommended; the others are listed separately
+            notfound = []
+            ranks = []
+            for pkg in sample.keys():
+                if pkg in recommendation.ranking:
+                    ranks.append(recommendation.ranking.index(pkg))
+                else:
+                    notfound.append(pkg)
+            for r in sorted(ranks):
+                output.write(str(r)+"\n")
+            if notfound:
+                output.write("Out of recommendation:\n")
+                for pkg in notfound:
+                    output.write(pkg+"\n")
+            output.close()
+            # Plot metrics summary
+            # each list collects [size, value] pairs, one per plotted size
+            accuracy = []
+            precision = []
+            recall = []
+            f1 = []
+            g = Gnuplot.Gnuplot()
+            g('set style data lines')
+            g.xlabel('Recommendation size')
+            # evaluate the top-'size' prefix of the ranking, in steps of 100
+            for size in range(1,len(recommendation.ranking)+1,100):
+                predicted = RecommendationResult(dict.fromkeys(recommendation.ranking[:size],1))
+                real = RecommendationResult(sample)
+                evaluation = Evaluation(predicted,real,self.repo_size)
+                accuracy.append([size,evaluation.run(Accuracy())])
+                precision.append([size,evaluation.run(Precision())])
+                recall.append([size,evaluation.run(Recall())])
+                f1.append([size,evaluation.run(F1())])
+            g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
+                   Gnuplot.Data(precision,title="Precision"),
+                   Gnuplot.Data(recall,title="Recall"),
+                   Gnuplot.Data(f1,title="F1"))
+            g.hardcopy(recall_file+"-plot.ps", enhanced=1, color=1)
+            # Iteration log
+            # NOTE(review): index 20 is the 21st [size, value] pair (size
+            # 2001); a ranking with fewer than 2001 entries raises IndexError
+            # here - confirm the repository is always large enough
+            # NOTE(review): the 'recall:' key carries a trailing colon,
+            # unlike the other keys - confirm whether that is intended
+            result = {'iteration': n,
+                      'weight': params['weight'],
+                      'strategy': params['strategy'],
+                      'accuracy': accuracy[20],
+                      'precision': precision[20],
+                      'recall:': recall[20],
+                      'f1': f1[20]}
+            return result
+
+#class CollaborativeSuite(expsuite.PyExperimentSuite):
+#    def reset(self, params, rep):
+#        if params['name'].startswith("collaborative"):
+#
+#    def iterate(self, params, rep, n):
+#        if params['name'].startswith("collaborative"):
+#            for root, dirs, files in os.walk(self.source_dir):
+#                for popcon_file in files:
+#                    submission = PopconSubmission(os.path.join(root,popcon_file))
+#                    user = User(submission.packages)
+#                    user.maximal_pkg_profile()
+#                    rec.get_recommendation(user)
+#                    precision = 0
+#                    result = {'weight': params['weight'],
+#                              'strategy': params['strategy'],
+#                              'profile_size': self.profile_size[n],
+#                              'accuracy': accuracy,
+#                              'precision': precision,
+#                              'recall:': recall,
+#                              'f1': }
+#        else:
+#            result = {}
+#        return result
+
+if __name__ == '__main__':
+    # A suite runs when its keyword is on the command line, or when fewer
+    # than two arguments were given.
+    for keyword, suite in (("clustering",ClusteringSuite),
+                           ("content",ContentBasedSuite)):
+        if keyword in sys.argv or len(sys.argv)<3:
+            suite().start()
+    #if "collaborative" in sys.argv or len(sys.argv)<3:
+    #CollaborativeSuite().start()
@@ -0,0 +1,49 @@
+#! /usr/bin/env python
+"""
+    sample-popcon - extract a sample from popcon population
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import xapian
+import os
+import random
+import sys
+
+if __name__ == '__main__':
+    try:
+        sample_file = sys.argv[1]
+    	popcon = xapian.WritableDatabase(sys.argv[2],xapian.DB_OPEN)
+    except:
+        print "Usage: extract-sample-db sample_file popcon_index"
+        exit(1)
+    enquire = xapian.Enquire(popcon)
+    print sample_file.split("/")
+    new_popcon = xapian.WritableDatabase(sys.argv[2]+"-"+sample_file.split("/")[-1],xapian.DB_CREATE_OR_OVERWRITE)
+    print ("Popcon repository size: %d" % popcon.get_doccount())
+    for submission in open(sample_file):
+        print "ID"+submission.strip()
+        query = xapian.Query("ID"+submission.strip())
+        enquire.set_query(query)
+        mset = enquire.get_mset(0,20)
+        for m in mset:
+            print "Adding doc %s"%m.docid
+            new_popcon.add_document(popcon.get_document(m.docid))
+            print "Removing doc %s"%m.docid
+            popcon.delete_document(m.docid)
+    print ("Popcon repository size: %d" % popcon.get_doccount())
+    print ("Popcon repository size: %d" % new_popcon.get_doccount())
@@ -0,0 +1,197 @@
+#!/usr/bin/env python
+"""
+    hybrid-suite
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import sys
+sys.path.insert(0,'../')
+from config import Config
+from data import PopconXapianIndex, PopconSubmission
+from recommender import Recommender
+from user import LocalSystem, User
+from evaluation import *
+import logging
+import random
+import Gnuplot
+import numpy
+
+if __name__ == '__main__':
+    if len(sys.argv)<2:
+        print "Usage: hybrid strategy sample_file"
+        exit(1)
+
+    iterations = 20
+    profile_size = [10,40,70,100,170,240]
+    neighbor_size = [3,10,50,100,200,400]
+
+    #hybrid_strategies = ['knnco','knnco_eset']
+
+    #iterations = 1
+    #profile_size = [10,20,30]
+    #neighbor_size = [10,20,30]
+
+    cfg = Config()
+    population_sample = []
+    strategy = sys.argv[1]
+    sample_file = sys.argv[2]
+    sample_str = sample_file.split('/')[-1]
+    with open(sample_file,'r') as f:
+        for line in f.readlines():
+            user_id = line.strip('\n')
+            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
+    sample_dir = ("results/hybrid/%s" % sample_str)
+    if not os.path.exists(sample_dir):
+        os.makedirs(sample_dir)
+
+    cfg.strategy = strategy
+    p_20_summary = {}
+    f05_100_summary = {}
+    c_20 = {}
+    c_100 = {}
+
+    log_file = os.path.join(sample_dir,sample_str+"-"+cfg.strategy)
+    graph_20 = {}
+    graph_100 = {}
+    graph_20_jpg = {}
+    graph_100_jpg = {}
+    comment_20 = {}
+    comment_100 = {}
+    for k in neighbor_size:
+        graph_20[k] = log_file+("-neighboorhod%.3d-020.png"%k)
+        graph_100[k] = log_file+("-neighboorhod%.3d-100.png"%k)
+        graph_20_jpg[k] = graph_20[k].strip(".png")+".jpg"
+        graph_100_jpg[k] = graph_100[k].strip(".png")+".jpg"
+        comment_20[k] = graph_20_jpg[k]+".comment"
+        comment_100[k] = graph_100_jpg[k]+".comment"
+
+        with open(comment_20[k],'w') as f:
+            f.write("# %s\n" % sample_str)
+            f.write("# strategy %s\n# threshold 20\n# iterations %d\n\n" %
+                    (cfg.strategy,iterations))
+            f.write("# neighboorhood\tprofile\tp_20\tc_20\n\n")
+        with open(comment_100[k],'w') as f:
+            f.write("# %s\n" % sample_str)
+            f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
+                    (cfg.strategy,iterations))
+            f.write("# neighboorhood\tprofile\tf05_100\tc_100\n\n")
+
+        c_20[k] = {}
+        c_100[k] = {}
+        p_20_summary[k] = {}
+        f05_100_summary[k] = {}
+        for size in profile_size:
+            c_20[k][size] = set()
+            c_100[k][size] = set()
+            p_20_summary[k][size] = []
+            f05_100_summary[k][size] = []
+            with open(log_file+"-neighboorhood%.3d-profile%.3d"%(k,size),'w') as f:
+                f.write("# %s\n" % sample_str)
+                f.write("# strategy %s-neighboorhood%.3d-profile%.3d\n\n" % (cfg.strategy,k,size))
+                f.write("# p_20\t\tf05_100\n\n")
+
+    # main loop per user
+    for submission_file in population_sample:
+        user = PopconSystem(submission_file)
+        user.filter_pkg_profile(cfg.pkgs_filter)
+        user.maximal_pkg_profile()
+        for k in neighbor_size:
+            cfg.k_neighbors = k
+            for size in profile_size:
+                cfg.profile_size = size
+                rec = Recommender(cfg)
+                repo_size = rec.items_repository.get_doccount()
+                p_20 = []
+                f05_100 = []
+                for n in range(iterations):
+                    # Fill sample profile
+                    profile_len = len(user.pkg_profile)
+                    item_score = {}
+                    for pkg in user.pkg_profile:
+                        item_score[pkg] = user.item_score[pkg]
+                    sample = {}
+                    sample_size = int(profile_len*0.9)
+                    for i in range(sample_size):
+                         key = random.choice(item_score.keys())
+                         sample[key] = item_score.pop(key)
+                    iteration_user = User(item_score)
+                    recommendation = rec.get_recommendation(iteration_user,repo_size)
+                    if hasattr(recommendation,"ranking"):
+                        ranking = recommendation.ranking
+                        real = RecommendationResult(sample)
+                        predicted_20 = RecommendationResult(dict.fromkeys(ranking[:20],1))
+                        evaluation = Evaluation(predicted_20,real,repo_size)
+                        p_20.append(evaluation.run(Precision()))
+                        predicted_100 = RecommendationResult(dict.fromkeys(ranking[:100],1))
+                        evaluation = Evaluation(predicted_100,real,repo_size)
+                        f05_100.append(evaluation.run(F_score(0.5)))
+                        c_20[k][size] = c_20[k][size].union(recommendation.ranking[:20])
+                        c_100[k][size] = c_100[k][size].union(recommendation.ranking[:100])
+                # save summary
+                if p_20:
+                    p_20_summary[k][size].append(sum(p_20)/len(p_20))
+                if f05_100:
+                    f05_100_summary[k][size].append(sum(f05_100)/len(f05_100))
+
+                with open(log_file+"-neighboorhood%.3d-profile%.3d"%(k,size),'a') as f:
+                    f.write("%.4f\t\t%.4f\n" %
+                            ((sum(p_20)/len(p_20),sum(f05_100)/len(f05_100))))
+
+    # back to main flow
+    coverage_20 = {}
+    coverage_100 = {}
+    for k in neighbor_size:
+        coverage_20[k] = {}
+        coverage_100[k] = {}
+        with open(comment_20[k],'a') as f:
+            for size in profile_size:
+                coverage_20[k][size] = len(c_20[k][size])/float(repo_size)
+                f.write("%3d\t\t%3d\t\t%.4f\t%.4f\n" %
+                        (k,size,float(sum(p_20_summary[k][size]))/len(p_20_summary[k][size]),coverage_20[k][size]))
+        with open(comment_100[k],'a') as f:
+            for size in profile_size:
+                coverage_100[k][size] = len(c_100[k][size])/float(repo_size)
+                f.write("%3d\t\t%3d\t\t%.4f\t%.4f\n" %
+                        (k,size,float(sum(f05_100_summary[k][size]))/len(f05_100_summary[k][size]),coverage_100[k][size]))
+
+    for k in neighbor_size:
+        # plot results summary
+        g = Gnuplot.Gnuplot()
+        g('set style data lines')
+        g('set yrange [0:1.0]')
+        g.xlabel('Profile size')
+        g.title("Setup: %s-neighboorhood%3d (threshold 20)" % (cfg.strategy,k))
+        g.plot(Gnuplot.Data(sorted([[i,sum(p_20_summary[k][i])/len(p_20_summary[k][i])]
+                                    for i in p_20_summary[k].keys()]),title="Precision"),
+               Gnuplot.Data(sorted([[i,coverage_20[k][i]]
+                                    for i in coverage_20[k].keys()]),title="Coverage"))
+        g.hardcopy(graph_20[k],terminal="png")
+        #commands.getoutput("convert -quality 100 %s %s" %
+        #                   (graph_20[k],graph_20_jpg[k]))
+        g = Gnuplot.Gnuplot()
+        g('set style data lines')
+        g('set yrange [0:1.0]')
+        g.xlabel('Profile size')
+        g.title("Setup: %s-neighboorhood%3d (threshold 100)" % (cfg.strategy,k))
+        g.plot(Gnuplot.Data(sorted([[i,sum(f05_100_summary[k][i])/len(f05_100_summary[k][i])]
+                                    for i in f05_100_summary[k].keys()]),title="F05"),
+               Gnuplot.Data(sorted([[i,coverage_100[k][i]]
+                                    for i in coverage_100[k].keys()]),title="Coverage"))
+        g.hardcopy(graph_100[k],terminal="png")
+        #commands.getoutput("convert -quality 100 %s %s" %
+        #                   (graph_100[k],graph_100_jpg[k]))
@@ -0,0 +1,186 @@
+#!/usr/bin/env python
+"""
+    k-suite - experiment different neighborhood sizes
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import sys
+sys.path.insert(0,'../')
+from config import Config
+from data import PopconXapianIndex, PopconSubmission
+from recommender import Recommender
+from user import LocalSystem, User
+from evaluation import *
+import logging
+import random
+import Gnuplot
+import numpy
+
+def plot_roc(k,roc_points,log_file):
+    g = Gnuplot.Gnuplot()
+    g('set style data points')
+    g.xlabel('False Positive Rate')
+    g.ylabel('True Positive Rate')
+    g('set xrange [0:1.0]')
+    g('set yrange [0:1.0]')
+    g.title("Setup: %s-k%d" % (log_file.split("/")[-1],k))
+    g.plot(Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"),
+           Gnuplot.Data(roc_points))
+    g.hardcopy(log_file+("-k%.3d.png"%k),terminal="png")
+    g.hardcopy(log_file+("-k%.3d.ps"%k),terminal="postscript",enhanced=1,color=1)
+
+def plot_summary(precision,f05,mcc,log_file):
+    g = Gnuplot.Gnuplot()
+    g('set style data lines')
+    g.xlabel('Neighborhood (k)')
+    g.title("Setup: %s-size20" % (log_file.split("/")[-1]))
+    g.plot(Gnuplot.Data([[k,sum(precision[k])/len(precision[k])] for k in precision.keys()],title="P"),
+           Gnuplot.Data([[k,sum(f05[k])/len(f05[k])] for k in f05.keys()],title="F05"),
+           Gnuplot.Data([[k,sum(mcc[k])/len(mcc[k])] for k in mcc.keys()],title="MCC"))
+    g.hardcopy(log_file+(".png"),terminal="png")
+    g.hardcopy(log_file+(".ps"),terminal="postscript",enhanced=1,color=1)
+
+class ExperimentResults:
+    def __init__(self,repo_size):
+        self.repository_size = repo_size
+        self.precision = []
+        self.recall = []
+        self.fpr = []
+        self.f05 = []
+        self.mcc = []
+
+    def add_result(self,ranking,sample):
+        predicted = RecommendationResult(dict.fromkeys(ranking,1))
+        real = RecommendationResult(sample)
+        evaluation = Evaluation(predicted,real,self.repository_size)
+        self.precision.append(evaluation.run(Precision()))
+        self.recall.append(evaluation.run(Recall()))
+        self.fpr.append(evaluation.run(FPR()))
+        self.f05.append(evaluation.run(F_score(0.5)))
+        self.mcc.append(evaluation.run(MCC()))
+
+    def get_roc_point(self):
+        tpr = self.recall
+        fpr = self.fpr
+        if not tpr or not fpr:
+            return [0,0]
+        return [sum(fpr)/len(fpr),sum(tpr)/len(tpr)]
+
+    def get_precision_summary(self):
+        if not self.precision: return 0
+        return  sum(self.precision)/len(self.precision)
+
+    def get_f05_summary(self):
+        if not self.f05: return 0
+        return  sum(self.f05)/len(self.f05)
+
+    def get_mcc_summary(self):
+        if not self.mcc: return 0
+        return  sum(self.mcc)/len(self.mcc)
+
+if __name__ == '__main__':
+    if len(sys.argv)<3:
+        print "Usage: k-suite strategy_str sample_file"
+        exit(1)
+    threshold = 20
+    iterations = 30
+    neighbors = [3,5,10,50,100,150,200,300,400,500]
+    cfg = Config()
+    cfg.strategy = sys.argv[1]
+    sample_file = sys.argv[2]
+    population_sample = []
+    with open(sample_file,'r') as f:
+        for line in f.readlines():
+            user_id = line.strip('\n')
+            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
+    # setup dictionaries and files
+    roc_summary = {}
+    recommended = {}
+    precision_summary = {}
+    f05_summary = {}
+    mcc_summary = {}
+    sample_dir = ("results/k-suite/%s" % sample_file.split('/')[-1])
+    if not os.path.exists(sample_dir):
+        os.makedirs(sample_dir)
+    log_file = os.path.join(sample_dir,cfg.strategy)
+    with open(log_file,'w') as f:
+        f.write("# %s\n\n" % sample_file.split('/')[-1])
+        f.write("# strategy %s recommendation_size %d iterations %d\n\n" %
+                (cfg.strategy,threshold,iterations))
+        f.write("# k coverage \tprecision \tf05 \tmcc\n\n")
+
+    for k in neighbors:
+        roc_summary[k] = []
+        recommended[k] = set()
+        precision_summary[k] = []
+        f05_summary[k] = []
+        mcc_summary[k] = []
+        with open(log_file+"-k%.3d"%k,'w') as f:
+            f.write("# %s\n\n" % sample_file.split('/')[-1])
+            f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))
+            f.write("# roc_point \tprecision \tf05 \tmcc\n\n")
+
+    # main loop per user
+    for submission_file in population_sample:
+        user = PopconSystem(submission_file)
+        user.filter_pkg_profile(cfg.pkgs_filter)
+        user.maximal_pkg_profile()
+        for k in neighbors:
+            cfg.k_neighbors = k
+            rec = Recommender(cfg)
+            repo_size = rec.items_repository.get_doccount()
+            results = ExperimentResults(repo_size)
+            # n iterations for same recommender and user
+            for n in range(iterations):
+                # Fill sample profile
+                profile_len = len(user.pkg_profile)
+                item_score = {}
+                for pkg in user.pkg_profile:
+                    item_score[pkg] = user.item_score[pkg]
+                sample = {}
+                sample_size = int(profile_len*0.9)
+                for i in range(sample_size):
+                     key = random.choice(item_score.keys())
+                     sample[key] = item_score.pop(key)
+                iteration_user = User(item_score)
+                recommendation = rec.get_recommendation(iteration_user,threshold)
+                if hasattr(recommendation,"ranking"):
+                    results.add_result(recommendation.ranking,sample)
+                    recommended[k] = recommended[k].union(recommendation.ranking)
+            # save summary
+            roc_point = results.get_roc_point()
+            roc_summary[k].append(roc_point)
+            precision = results.get_precision_summary()
+            precision_summary[k].append(precision)
+            f05 = results.get_f05_summary()
+            f05_summary[k].append(f05)
+            mcc = results.get_mcc_summary()
+            mcc_summary[k].append(mcc)
+            with open(log_file+"-k%.3d"%k,'a') as f:
+                f.write("[%.2f,%.2f] \t%.4f \t%.4f \t%.4f\n" %
+                        (roc_point[0],roc_point[1],precision,f05,mcc))
+    # back to main flow
+    with open(log_file,'a') as f:
+        plot_summary(precision_summary,f05_summary,mcc_summary,log_file)
+        for k in neighbors:
+            coverage = len(recommended[size])/float(repo_size)
+            f.write("%3d \t%.2f \t%.4f \t%.4f \t%.4f\n" %
+                    (k,coverage,float(sum(precision_summary[k]))/len(precision_summary[k]),
+                     float(sum(f05_summary[k]))/len(f05_summary[k]),
+                     float(sum(mcc_summary[k]))/len(mcc_summary[k])))
+            plot_roc(k,roc_summary[k],log_file)
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+"""
+    recommender suite - recommender experiments suite 
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import sys
+import os
+sys.path.insert(0,'../')
+from config import Config
+from data import PopconXapianIndex, PopconSubmission
+from recommender import Recommender
+from user import LocalSystem, User
+from evaluation import *
+import logging
+import random
+import Gnuplot
+
+if __name__ == '__main__':
+
+    cfg = Config()
+    cfg.index_mode = "recluster"
+    logging.info("Starting clustering experiments")
+    logging.info("Medoids: %d\t Max popcon:%d" % (cfg.k_medoids,cfg.max_popcon))
+    cfg.popcon_dir = os.path.expanduser("~/org/popcon.debian.org/popcon-mail/popcon-entries/")
+    cfg.popcon_index = cfg.popcon_index+("_%dmedoids%dmax" %
+                                         (cfg.k_medoids,cfg.max_popcon))
+    cfg.clusters_dir = cfg.clusters_dir+("_%dmedoids%dmax" %
+                                         (cfg.k_medoids,cfg.max_popcon))
+    pxi = PopconXapianIndex(cfg)
+    logging.info("Overall dispersion: %f\n" % pxi.cluster_dispersion)
+    # Write clustering log
+    output = open(("results/clustering/%dmedoids%dmax" % (cfg.k_medoids,cfg.max_popcon)),'w')
+    output.write("# k_medoids\tmax_popcon\tdispersion\n")
+    output.write("%d %f\n" % (cfg.k_medoids,cfg.max_popcon,pxi.cluster_dispersion))
+    output.close()
@@ -0,0 +1,27 @@
+[DEFAULT]
+repetitions = 1
+iterations = 10
+path = 'results'
+experiment = 'grid'
+weight = ['bm25', 'trad']
+;profile_size = range(10,100,10)
+;sample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
+sample = [0.6, 0.7, 0.8, 0.9]
+
+[content]
+strategy = ['cb','cbt','cbd']
+
+[clustering]
+experiment = 'single'
+;iterations = 4
+;medoids = range(2,6)
+iterations = 6
+medoids = [100,500,1000,5000,10000,50000]
+;disabled for this experiment
+weight = 0
+profile_size = 0
+sample = 0
+
+[colaborative]
+users_repository=["data/popcon","data/popcon-100","data/popcon-500","data/popcon-1000","data/popcon-5000","data/popcon-10000","data/popcon-50000"]
+neighbors = range(10,1010,50)
@@ -0,0 +1,171 @@
+#!/usr/bin/env python
+"""
+    recommender suite - recommender experiments suite 
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import expsuite
+import sys
+sys.path.insert(0,'../')
+from config import Config
+from data import PopconXapianIndex, PopconSubmission
+from recommender import Recommender
+from user import LocalSystem, User
+from evaluation import *
+import logging
+import random
+import Gnuplot
+
+class ClusteringSuite(expsuite.PyExperimentSuite):
+    def reset(self, params, rep):
+        self.cfg = Config()
+        self.cfg.popcon_index = "../tests/test_data/.sample_pxi"
+        self.cfg.popcon_dir = "../tests/test_data/popcon_dir"
+        self.cfg.clusters_dir = "../tests/test_data/clusters_dir"
+
+        if params['name'] == "clustering":
+            logging.info("Starting 'clustering' experiments suite...")
+            self.cfg.index_mode = "recluster"
+
+    def iterate(self, params, rep, n):
+        if params['name'] == "clustering":
+            logging.info("Running iteration %d" % params['medoids'][n])
+            self.cfg.k_medoids = params['medoids'][n]
+            pxi = PopconXapianIndex(self.cfg)
+            result = {'k_medoids': params['medoids'][n],
+                   'dispersion': pxi.cluster_dispersion}
+        else:
+            result = {}
+        return result
+
+class ContentBasedSuite(expsuite.PyExperimentSuite):
+    def reset(self, params, rep):
+        if params['name'].startswith("content"):
+            cfg = Config()
+            #if the index was not built yet
+            #app_axi = AppAptXapianIndex(cfg.axi,"results/arnaldo/AppAxi")
+            cfg.axi = "data/AppAxi"
+            cfg.index_mode = "old"
+            cfg.weight = params['weight']
+            self.rec = Recommender(cfg)
+            self.rec.set_strategy(params['strategy'])
+            self.repo_size = self.rec.items_repository.get_doccount()
+            self.user = LocalSystem()
+            self.user.app_pkg_profile(self.rec.items_repository)
+            self.user.no_auto_pkg_profile()
+            self.sample_size = int(len(self.user.pkg_profile)*params['sample'])
+            # iteration should be set to 10 in config file
+            #self.profile_size = range(10,101,10)
+
+    def iterate(self, params, rep, n):
+        if params['name'].startswith("content"):
+            item_score = dict.fromkeys(self.user.pkg_profile,1)
+            # Prepare partition
+            sample = {}
+            for i in range(self.sample_size):
+                 key = random.choice(item_score.keys())
+                 sample[key] = item_score.pop(key)
+            # Get full recommendation
+            user = User(item_score)
+            recommendation = self.rec.get_recommendation(user,self.repo_size)
+            # Write recall log
+            recall_file = "results/content/recall/%s-%s-%.2f-%d" % \
+                          (params['strategy'],params['weight'],params['sample'],n)
+            output = open(recall_file,'w')
+            output.write("# weight=%s\n" % params['weight'])
+            output.write("# strategy=%s\n" % params['strategy'])
+            output.write("# sample=%f\n" % params['sample'])
+            output.write("\n%d %d %d\n" % \
+                         (self.repo_size,len(item_score),self.sample_size))
+            notfound = []
+            ranks = []
+            for pkg in sample.keys():
+                if pkg in recommendation.ranking:
+                    ranks.append(recommendation.ranking.index(pkg))
+                else:
+                    notfound.append(pkg)
+            for r in sorted(ranks):
+                output.write(str(r)+"\n")
+            if notfound:
+                output.write("Out of recommendation:\n")
+                for pkg in notfound:
+                    output.write(pkg+"\n")
+            output.close()
+            # Plot metrics summary
+            accuracy = []
+            precision = []
+            recall = []
+            f1 = []
+            g = Gnuplot.Gnuplot()
+            g('set style data lines')
+            g.xlabel('Recommendation size')
+            for size in range(1,len(recommendation.ranking)+1,100):
+                predicted = RecommendationResult(dict.fromkeys(recommendation.ranking[:size],1))
+                real = RecommendationResult(sample)
+                evaluation = Evaluation(predicted,real,self.repo_size)
+                accuracy.append([size,evaluation.run(Accuracy())])
+                precision.append([size,evaluation.run(Precision())])
+                recall.append([size,evaluation.run(Recall())])
+                f1.append([size,evaluation.run(F1())])
+            g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
+                   Gnuplot.Data(precision,title="Precision"),
+                   Gnuplot.Data(recall,title="Recall"),
+                   Gnuplot.Data(f1,title="F1"))
+            g.hardcopy(recall_file+"-plot.ps", enhanced=1, color=1)
+            # Iteration log
+            result = {'iteration': n,
+                      'weight': params['weight'],
+                      'strategy': params['strategy'],
+                      'accuracy': accuracy[20],
+                      'precision': precision[20],
+                      'recall:': recall[20],
+                      'f1': f1[20]}
+            return result
+
+#class CollaborativeSuite(expsuite.PyExperimentSuite):
+#    def reset(self, params, rep):
+#        if params['name'].startswith("collaborative"):
+#
+#    def iterate(self, params, rep, n):
+#        if params['name'].startswith("collaborative"):
+#            for root, dirs, files in os.walk(self.source_dir):
+#                for popcon_file in files:
+#                    submission = PopconSubmission(os.path.join(root,popcon_file))
+#                    user = User(submission.packages)
+#                    user.maximal_pkg_profile()
+#                    rec.get_recommendation(user)
+#                    precision = 0
+#                    result = {'weight': params['weight'],
+#                              'strategy': params['strategy'],
+#                              'profile_size': self.profile_size[n],
+#                              'accuracy': accuracy,
+#                              'precision': precision,
+#                              'recall:': recall,
+#                              'f1': }
+#        else:
+#            result = {}
+#        return result
+
+if __name__ == '__main__':
+
+    if "clustering" in sys.argv or len(sys.argv)<3:
+        ClusteringSuite().start()
+    if "content" in sys.argv or len(sys.argv)<3:
+        ContentBasedSuite().start()
+    #if "collaborative" in sys.argv or len(sys.argv)<3:
+    #CollaborativeSuite().start()
@@ -0,0 +1,74 @@
+#! /usr/bin/env python
+"""
+    misc_popcon - misc experiments with popcon data
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import Gnuplot
+import xapian
+import os
+import random
+import sys
+
+def get_population_profile(popcon):
+    profiles_size = []
+    for n in range(1,popcon.get_doccount()):
+        user = popcon.get_document(n)
+        pkgs_profile = [t.term for t in user.termlist() if t.term.startswith("XP")]
+        if len(pkgs_profile)<10:
+            print "-- profile<10:",user.get_data()
+        profiles_size.append(len(pkgs_profile))
+    max_profile = max(profiles_size)
+    population_profile = [(n,profiles_size.count(n))
+                          for n in range(max_profile+1)
+                          if profiles_size.count(n)>0 ]
+    return population_profile,max_profile
+
+def get_profile_ranges(population_profile,max_profile,popcon_size):
+    ranges = range(0,251,50)
+    ranges.append(max_profile)
+    ranges_population = []
+    ranges_percentage = []
+    for maximum in ranges[1:]:
+        minimum = ranges[ranges.index(maximum)-1]
+        valid = [x[1] for x in population_profile
+                 if x[0]>minimum and x[0]<=maximum]
+        ranges_population.append((maximum,sum(valid)))
+        ranges_percentage.append((maximum,sum(valid)/float(popcon_size)))
+    return ranges_population,ranges_percentage
+
+def plot(data,xlabel,ylabel,output):
+    g = Gnuplot.Gnuplot()
+    g('set style data points')
+    g.xlabel(xlabel)
+    g.ylabel(ylabel)
+    g.plot(data)
+    g.hardcopy(output+".png", terminal="png")
+    g.hardcopy(output+".ps", terminal="postscript", enhanced=1, color=1)
+
+if __name__ == '__main__':
+    popcon = xapian.Database(os.path.expanduser("~/.app-recommender/popcon_desktopapps"))
+    print ("Popcon repository size: %d" % popcon.get_doccount())
+
+    profile_population,max_profile =  get_population_profile(popcon)
+    ranges_population,ranges_percentage = get_profile_ranges(profile_population,
+                                                             max_profile,popcon.get_doccount())
+    print "Population per profile range (up to index)"
+    print ranges_population
+    plot(profile_population,"Desktop profile size","Population size",
+         "results/misc-popcon/profile_population")
@@ -0,0 +1,199 @@
+#!/usr/bin/env python
+"""
+    profile-suite - experiment different profile sizes
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import sys
+sys.path.insert(0,'../')
+from config import Config
+from data import PopconXapianIndex, PopconSubmission
+from recommender import Recommender
+from user import LocalSystem, User
+from evaluation import *
+import logging
+import random
+import Gnuplot
+import numpy
+
+if __name__ == '__main__':
+    if len(sys.argv)<2:
+        print "Usage: profile-suite strategy_category sample_file"
+        exit(1)
+
+    iterations = 20
+    profile_size = [10,20,40,70,100,140,170,200,240]
+    neighbor_size = [3,5,10,50,100,150,200,300,400,500]
+
+    content_strategies = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
+    collaborative_strategies = ['knn_eset']#,'knn_eset','knn_plus']
+    #collaborative_strategies = ['knn','knn_eset','knn_plus']
+
+    #iterations = 1
+    #profile_size = [10,20,30]
+    #neighbor_size = [10,20,30]
+    #content_strategies = ['cb']
+    #collaborative_strategies = ['knn_eset']
+
+    strategy_category = sys.argv[1]
+    if strategy_category == "content":
+        strategies = content_strategies
+        sizes = profile_size
+        option_str = "profile"
+    elif strategy_category == "collaborative":
+        strategies = collaborative_strategies
+        sizes = neighbor_size
+        option_str = "neighborhood"
+    else:
+        print "Usage: profile-suite strategy_category sample_file"
+        exit(1)
+
+    cfg = Config()
+    population_sample = []
+    sample_file = sys.argv[2]
+    sample_str = sample_file.split('/')[-1]
+    with open(sample_file,'r') as f:
+        for line in f.readlines():
+            user_id = line.strip('\n')
+            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
+    sample_dir = ("results/%s/%s" %
+                  (strategy_category,sample_str))
+    if not os.path.exists(sample_dir):
+        os.makedirs(sample_dir)
+
+    for strategy in strategies:
+        cfg.strategy = strategy
+        p_20_summary = {}
+        f05_100_summary = {}
+        c_20 = {}
+        c_100 = {}
+
+        log_file = os.path.join(sample_dir,sample_str+"-"+cfg.strategy)
+        graph_20 = log_file+"-20.png"
+        graph_100 = log_file+"-100.png"
+        graph_20_jpg = graph_20.strip(".png")+".jpg"
+        graph_100_jpg = graph_100.strip(".png")+".jpg"
+        comment_20 = graph_20_jpg+".comment"
+        comment_100 = graph_100_jpg+".comment"
+
+        with open(comment_20,'w') as f:
+            f.write("# sample %s\n" % sample_str)
+            f.write("# strategy %s\n# threshold 20\n# iterations %d\n\n" %
+                    (cfg.strategy,iterations))
+            f.write("# %s\tp_20\tc_20\n\n"%option_str)
+        with open(comment_100,'w') as f:
+            f.write("# sample %s\n" % sample_str)
+            f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
+                    (cfg.strategy,iterations))
+            f.write("# %s\t\tf05_100\t\tc_100\n\n"%option_str)
+
+        for size in sizes:
+            c_20[size] = set()
+            c_100[size] = set()
+            p_20_summary[size] = []
+            f05_100_summary[size] = []
+            with open(log_file+"-%s%.3d"%(option_str,size),'w') as f:
+                f.write("# sample %s\n" % sample_str)
+                f.write("# strategy %s-%s%.3d\n\n" % (cfg.strategy,option_str,size))
+                f.write("# p_20\tf05_100\n\n")
+
+        # main loop per user
+        for submission_file in population_sample:
+            user = PopconSystem(submission_file)
+            user.filter_pkg_profile(cfg.pkgs_filter)
+            user.maximal_pkg_profile()
+            for size in sizes:
+                cfg.profile_size = size
+                cfg.k_neighbors = size
+                rec = Recommender(cfg)
+                repo_size = rec.items_repository.get_doccount()
+                p_20 = []
+                f05_100 = []
+                for n in range(iterations):
+                    # Fill sample profile
+                    profile_len = len(user.pkg_profile)
+                    item_score = {}
+                    for pkg in user.pkg_profile:
+                        item_score[pkg] = user.item_score[pkg]
+                    sample = {}
+                    sample_size = int(profile_len*0.9)
+                    for i in range(sample_size):
+                         key = random.choice(item_score.keys())
+                         sample[key] = item_score.pop(key)
+                    iteration_user = User(item_score)
+                    recommendation = rec.get_recommendation(iteration_user,repo_size)
+                    if hasattr(recommendation,"ranking"):
+                        ranking = recommendation.ranking
+                        real = RecommendationResult(sample)
+                        predicted_20 = RecommendationResult(dict.fromkeys(ranking[:20],1))
+                        evaluation = Evaluation(predicted_20,real,repo_size)
+                        p_20.append(evaluation.run(Precision()))
+                        predicted_100 = RecommendationResult(dict.fromkeys(ranking[:100],1))
+                        evaluation = Evaluation(predicted_100,real,repo_size)
+                        f05_100.append(evaluation.run(F_score(0.5)))
+                        c_20[size] = c_20[size].union(recommendation.ranking[:20])
+                        c_100[size] = c_100[size].union(recommendation.ranking[:100])
+                # save summary
+                if p_20:
+                    p_20_summary[size].append(sum(p_20)/len(p_20))
+                if f05_100:
+                    f05_100_summary[size].append(sum(f05_100)/len(f05_100))
+
+                with open(log_file+"-%s%.3d"%(option_str,size),'a') as f:
+                    f.write("%.4f \t%.4f\n" %
+                            ((sum(p_20)/len(p_20),sum(f05_100)/len(f05_100))))
+
+        # back to main flow
+        coverage_20 = {}
+        coverage_100 = {}
+        with open(comment_20,'a') as f:
+            for size in sizes:
+                coverage_20[size] = len(c_20[size])/float(repo_size)
+                f.write("%3d\t\t%.4f\t\t%.4f\n" %
+                        (size,float(sum(p_20_summary[size]))/len(p_20_summary[size]),coverage_20[size]))
+        with open(comment_100,'a') as f:
+            for size in sizes:
+                coverage_100[size] = len(c_100[size])/float(repo_size)
+                f.write("%3d\t\t%.4f\t\t%.4f\n" %
+                        (size,float(sum(f05_100_summary[size]))/len(f05_100_summary[size]),coverage_100[size]))
+
+        # plot results summary
+        g = Gnuplot.Gnuplot()
+        g('set style data lines')
+        g('set yrange [0:1.0]')
+        g.xlabel('%s size'%option_str.capitalize())
+        g.title("Setup: %s (threshold 20)" % cfg.strategy)
+        g.plot(Gnuplot.Data(sorted([[k,sum(p_20_summary[k])/len(p_20_summary[k])]
+                                    for k in p_20_summary.keys()]),title="Precision"),
+               Gnuplot.Data(sorted([[k,coverage_20[k]]
+                                    for k in coverage_20.keys()]),title="Coverage"))
+        g.hardcopy(graph_20,terminal="png")
+        commands.getoutput("convert -quality 20 %s %s" %
+                           (graph_100,graph_20_jpg))
+        g = Gnuplot.Gnuplot()
+        g('set style data lines')
+        g('set yrange [0:1.0]')
+        g.xlabel('%s size'%option_str.capitalize())
+        g.title("Setup: %s (threshold 100)" % cfg.strategy)
+        g.plot(Gnuplot.Data(sorted([[k,sum(f05_100_summary[k])/len(f05_100_summary[k])]
+                                    for k in f05_100_summary.keys()]),title="F05"),
+               Gnuplot.Data(sorted([[k,coverage_100[k]]
+                                    for k in coverage_100.keys()]),title="Coverage"))
+        g.hardcopy(graph_100,terminal="png")
+        commands.getoutput("convert -quality 100 %s %s" %
+                           (graph_100,graph_100_jpg))
@@ -0,0 +1,231 @@
+#!/usr/bin/env python
+"""
+    recommender suite - recommender experiments suite 
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import sys
+sys.path.insert(0,'../')
+from config import Config
+from data import PopconXapianIndex, PopconSubmission
+from recommender import Recommender
+from user import LocalSystem, User
+from evaluation import *
+import logging
+import random
+import Gnuplot
+import numpy
+
+#iterations = 3
+#sample_proportions = [0.9]
+#weighting = [('bm25',1.2)]
+#collaborative = ['knn_eset']
+#content_based = ['cb']
+#hybrid = ['knnco']
+#profile_size = [50,100]
+#popcon_size = ["1000"]
+#neighbors = [50]
+
+iterations = 30
+sample_proportions = [0.9]
+weighting = [('bm25',1.0)]
+content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
+collaborative = ['knn_eset','knn','knn_plus']
+hybrid = ['knnco','knnco_eset']
+profile_size = range(20,200,40)
+neighbors = range(10,510,50)
+
+def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
+    # Write recall log
+    output = open(("%s-%.2d" % (log_file,n)),'w')
+    output.write("# %s-n\n" % label["description"])
+    output.write("# %s-%.2d\n" % (label["values"],n))
+    output.write("\n# repository profile sample\n%d %d %d\n" % \
+                 (repo_size,profile_size,len(sample)))
+    if hasattr(recommendation,"ranking"):
+        notfound = []
+        ranks = []
+        for pkg in sample.keys():
+            if pkg in recommendation.ranking:
+                ranks.append(recommendation.ranking.index(pkg))
+            else:
+                notfound.append(pkg)
+        for r in sorted(ranks):
+            output.write(str(r)+"\n")
+        if notfound:
+            output.write("# out of recommendation:\n")
+            for pkg in notfound:
+                output.write(pkg+"\n")
+    output.close()
+
+def plot_roc(roc_points,eauc,c,p,log_file):
+    g = Gnuplot.Gnuplot()
+    g('set style data lines')
+    g.xlabel('False Positive Rate')
+    g.ylabel('True Positive Rate')
+    g('set xrange [0:1.0]')
+    g('set yrange [0:1.0]')
+    g.title("Setup: %s" % log_file.split("/")[-1])
+    g('set label "C %.2f" at 0.8,0.25' % c)
+    g('set label "P(20) %.2f" at 0.8,0.2' % p)
+    g('set label "AUC %.4f" at 0.8,0.15' % eauc)
+    g.plot(Gnuplot.Data(roc_points,title="ROC"),
+           Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"))
+           #Gnuplot.Data([roc_points[-1],[1,1]],with_="lines lt 6"))
+    g.hardcopy(log_file+"-roc.png",terminal="png")
+    g.hardcopy(log_file+"-roc.ps",terminal="postscript",enhanced=1,color=1)
+
+def get_label(cfg,sample_proportion):
+    label = {}
+    if cfg.strategy in content_based:
+        label["description"] = "strategy-profile"
+        label["values"] = ("%s-profile%.3d" %
+                           (cfg.strategy,cfg.profile_size))
+    elif cfg.strategy in collaborative:
+       label["description"] = "strategy-knn"
+       label["values"] = ("%s-k%.3d" %
+                          (cfg.strategy,cfg.k_neighbors))
+    elif cfg.strategy in hybrid:
+       label["description"] = "strategy-knn-profile"
+       label["values"] = ("%s-k%.3d-profile%.3d" %
+                          (cfg.strategy,cfg.k_neighbors,cfg.profile_size))
+    else:
+        print "Unknown strategy"
+    return label
+
+class ExperimentResults:
+    def __init__(self,repo_size):
+        self.repository_size = repo_size
+        self.precision = {}
+        self.recall = {}
+        self.fpr = {}
+        points = [1]+range(10,self.repository_size,10)
+        self.recommended = set()
+        for size in points:
+            self.precision[size] = []
+            self.recall[size] = []
+            self.fpr[size] = []
+
+    def add_result(self,ranking,sample):
+        self.recommended = self.recommended.union(ranking)
+        # get data only for point
+        for size in self.precision.keys():
+            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
+            real = RecommendationResult(sample)
+            evaluation = Evaluation(predicted,real,self.repository_size)
+            self.precision[size].append(evaluation.run(Precision()))
+            self.recall[size].append(evaluation.run(Recall()))
+            self.fpr[size].append(evaluation.run(FPR()))
+
+    # Average ROC by threshold (= size of recommendation)
+    def get_roc_points(self):
+        points = []
+        for size in self.recall.keys():
+            tpr = self.recall[size]
+            fpr = self.fpr[size]
+            points.append([sum(fpr)/len(fpr),sum(tpr)/len(tpr)])
+        return sorted(points)
+
+def run_strategy(cfg,user):
+    for weight in weighting:
+        cfg.weight = weight[0]
+        cfg.bm25_k1 = weight[1]
+        rec = Recommender(cfg)
+        repo_size = rec.items_repository.get_doccount()
+        for proportion in sample_proportions:
+            results = ExperimentResults(repo_size)
+            label = get_label(cfg,proportion)
+            user_dir = ("results/roc-suite/%s" % user.user_id[:8])
+            if not os.path.exists(user_dir):
+                os.mkdir(user_dir)
+            log_file = os.path.join(user_dir,label["values"])
+            for n in range(iterations):
+                # Fill sample profile
+                profile_len = len(user.pkg_profile)
+                item_score = {}
+                for pkg in user.pkg_profile:
+                    item_score[pkg] = user.item_score[pkg]
+                sample = {}
+                sample_size = int(profile_len*proportion)
+                for i in range(sample_size):
+                     key = random.choice(item_score.keys())
+                     sample[key] = item_score.pop(key)
+                iteration_user = User(item_score)
+                recommendation = rec.get_recommendation(iteration_user,repo_size)
+                write_recall_log(label,n,sample,recommendation,profile_len,repo_size,log_file)
+                if hasattr(recommendation,"ranking"):
+                    results.add_result(recommendation.ranking,sample)
+            with open(log_file,'w') as f:
+                roc_points = results.get_roc_points()
+                x_coord = [p[0] for p in roc_points]
+                y_coord = [p[1] for p in roc_points]
+                auc = numpy.trapz(y=y_coord, x=x_coord)
+                eauc = (auc+
+                        numpy.trapz(y=[0,roc_points[0][1]],x=[0,roc_points[0][0]])+
+                        numpy.trapz(y=[roc_points[-1][1],1],x=[roc_points[-1][0],1]))
+                precision_20 = sum(results.precision[10])/len(results.precision[10])
+                coverage = len(results.recommended)/float(repo_size)
+                f.write("# %s\n# %s\n\n" %
+                        (label["description"],label["values"]))
+                f.write("# coverage \tp(20) \tauc \teauc\n\t%.2f \t%.2f \t%.4f \t%.4f\n\n" %
+                        (coverage,precision_20,auc,eauc))
+            plot_roc(roc_points,eauc,coverage,precision_20,log_file)
+
+def run_content(user,cfg):
+    for strategy in content_based:
+        cfg.strategy = strategy
+        for size in profile_size:
+            cfg.profile_size = size
+            run_strategy(cfg,user)
+
+def run_collaborative(user,cfg):
+    popcon_desktopapps = cfg.popcon_desktopapps
+    popcon_programs = cfg.popcon_programs
+    for strategy in collaborative:
+        cfg.strategy = strategy
+        for k in neighbors:
+            cfg.k_neighbors = k
+            run_strategy(cfg,user)
+
+def run_hybrid(user,cfg):
+    popcon_desktopapps = cfg.popcon_desktopapps
+    popcon_programs = cfg.popcon_programs
+    for strategy in hybrid:
+        cfg.strategy = strategy
+        for k in neighbors:
+            cfg.k_neighbors = k
+            for size in profile_size:
+                cfg.profile_size = size
+                run_strategy(cfg,user)
+
+if __name__ == '__main__':
+    if len(sys.argv)<2:
+        print "Usage: roc-suite popcon_submission_path [content|collaborative|hybrid]"
+        exit(1)
+
+    cfg = Config()
+    user = PopconSystem(sys.argv[1])
+    user.filter_pkg_profile(cfg.pkgs_filter)
+    user.maximal_pkg_profile()
+
+    if "content" in sys.argv or len(sys.argv)<3:
+        run_content(user,cfg)
+    if "collaborative" in sys.argv or len(sys.argv)<3:
+        run_collaborative(user,cfg)
+    if "hybrid" in sys.argv or len(sys.argv)<3:
+        run_hybrid(user,cfg)
@@ -0,0 +1,44 @@
+#! /usr/bin/env python
+"""
+    sample-popcon-arch - extract a sample of a specific arch
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+import sys
+sys.path.insert(0,'../')
+import xapian
+import os
+import random
+import sys
+from user import RandomPopcon
+
+if __name__ == '__main__':
+    try:
+        size = int(sys.argv[1])
+        arch = sys.argv[2]
+        popcon_dir = sys.argv[3]
+        pkgs_filter = sys.argv[4]
+    except:
+        print "Usage: sample-popcon-arch size arch popcon_dir pkgs_filter"
+        exit(1)
+
+    sample_file = ("results/misc-popcon/sample-%s-%d" % (arch,size))
+    with open(sample_file,'w') as f:
+        for n in range(1,size+1):
+            user = RandomPopcon(popcon_dir,arch,pkgs_filter)
+            f.write(user.user_id+'\n')
+            print "sample",n
@@ -0,0 +1,53 @@
+#! /usr/bin/env python
+"""
+    sample-popcon - extract a sample from popcon population
+"""
+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
+__license__ = """
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import xapian
+import os
+import random
+import sys
+
+def extract_sample(size,popcon,min_profile,max_profile,output):
+    sample = []
+    for n in range(1,popcon.get_doccount()+1):
+        user = popcon.get_document(n)
+        pkgs_profile = [t.term for t in user.termlist() if t.term.startswith("XP")]
+        print len(pkgs_profile)
+        if len(pkgs_profile)>min_profile and len(pkgs_profile)<=max_profile:
+            sample.append(user.get_data())
+        print n,len(sample)
+        if len(sample)==size:
+            break
+    with open(("%s-%d-%d"%(output,min_profile,max_profile)),'w') as f:
+        for s in sample:
+            f.write(s+'\n')
+
+if __name__ == '__main__':
+    popcon = xapian.Database(os.path.expanduser("~/.app-recommender/popcon_desktopapps"))
+    print ("Popcon repository size: %d" % popcon.get_doccount())
+    try:
+        min_profile = int(sys.argv[1])
+        max_profile = int(sys.argv[2])
+        size = int(sys.argv[3])
+    except:
+        print "Usage: sample-popcon min_profile max_profile sample_size"
+        exit(1)
+    sample_file = "results/misc-popcon/sample"
+    extract_sample(size,popcon,min_profile,max_profile,sample_file)
@@ -75,20 +75,20 @@ class Recommender:
         """
         self.cfg = cfg
         # Load xapian indexes
-        self.axi_programs = xapian.Database(cfg.axi_programs)
+        #self.axi_programs = xapian.Database(cfg.axi_programs)
         self.axi_desktopapps = xapian.Database(cfg.axi_desktopapps)
         if cfg.popcon:
-            self.popcon_programs = xapian.Database(cfg.popcon_programs)
+            #self.popcon_programs = xapian.Database(cfg.popcon_programs)
             self.popcon_desktopapps = xapian.Database(cfg.popcon_desktopapps)
         # Load valid programs, desktopapps and tags
         # format: one package or tag name per line
-        self.valid_programs = []
+        #self.valid_programs = []
         self.valid_desktopapps = []
         self.valid_tags = []
         logging.info("Loading recommender filters")
-        with open(os.path.join(cfg.filters_dir,"programs")) as pkgs:
-            self.valid_programs = [line.strip() for line in pkgs
-                                   if not line.startswith("#")]
+        #with open(os.path.join(cfg.filters_dir,"programs")) as pkgs:
+        #    self.valid_programs = [line.strip() for line in pkgs
+        #                           if not line.startswith("#")]
         with open(os.path.join(cfg.filters_dir,"desktopapps")) as pkgs:
             self.valid_desktopapps = [line.strip() for line in pkgs
                                       if not line.startswith("#")]
@@ -109,19 +109,21 @@ class Recommender:
         Set the recommendation strategy.
         """
         logging.info("Setting recommender strategy to \'%s\'" % strategy_str)
-        if self.cfg.pkgs_filter.split("/")[-1] == "desktopapps":
-            self.items_repository = self.axi_desktopapps
-            self.valid_pkgs = self.valid_desktopapps
-        else:
-            self.items_repository = self.axi_programs
-            self.valid_pkgs = self.valid_programs
         # Check if collaborative strategies can be instanciated
-        if ("col" in strategy_str) or ("knn" in strategy_str):
+        if "knn" in strategy_str:
             if not self.cfg.popcon:
                 logging.info("Cannot perform collaborative strategy")
                 return 1
-            else:
-                self.users_repository = self.popcon_programs
+        #if self.cfg.pkgs_filter.split("/")[-1] == "desktopapps":
+        self.items_repository = self.axi_desktopapps
+        self.valid_pkgs = self.valid_desktopapps
+        if "knn" in strategy_str:
+            self.users_repository = self.popcon_desktopapps
+        #else:
+        #    self.items_repository = self.axi_programs
+        #    self.valid_pkgs = self.valid_programs
+        #    if "knn" in strategy_str:
+        #        self.users_repository = self.popcon_programs
         # Set strategy based on strategy_str
         if strategy_str == "cb":
             self.strategy = strategy.ContentBased("mix",self.cfg.profile_size)
@@ -151,8 +153,9 @@ class Recommender:
             self.strategy = strategy.KnnContent(self.cfg.k_neighbors)
         elif strategy_str == "knnco_eset":
             self.strategy = strategy.KnnContentEset(self.cfg.k_neighbors)
-        elif strategy_str.startswith("demo"):
-            self.strategy = strategy.Demographic(strategy_str)
+        # [FIXME: fix repository instanciation]
+        #elif strategy_str.startswith("demo"):
+        #    self.strategy = strategy.Demographic(strategy_str)
         else:
             logging.info("Strategy not defined.")
             return
@@ -20,6 +20,7 @@ __license__ = &quot;&quot;&quot;
     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 """
  
+import os
 import xapian
 from singleton import Singleton
 import recommender
@@ -111,7 +111,7 @@ class User:
     """
     Define a user of a recommender.
     """
-    def __init__(self,item_score,user_id=0,demo_profiles_set=0):
+    def __init__(self,item_score,user_id=0,arch=0,demo_profiles_set=0):
         """
         Set initial user attributes. pkg_profile gets the whole set of items,
         a random user_id is set if none was provided and the demographic
@@ -119,6 +119,7 @@ class User:
         """
         self.item_score = item_score
         self.pkg_profile = self.items()
+        self.arch = arch
  
         if user_id:
             self.user_id = user_id
@@ -272,21 +273,28 @@ class User:
         return self.pkg_profile
  
 class RandomPopcon(User):
-    def __init__(self,submissions_dir,pkgs_filter=0):
+    def __init__(self,submissions_dir,arch=0,pkgs_filter=0):
         """
         Set initial parameters.
         """
         len_profile = 0
-        while len_profile < 100:
+        match_arch = False
+        while len_profile < 100 or not match_arch:
             path = random.choice([os.path.join(root, submission) for
                                   root, dirs, files in os.walk(submissions_dir)
                                   for submission in files])
             user = PopconSystem(path)
+            print arch
+            print user.arch
+            if arch and user.arch==arch:
+                match_arch = True
+                print "match"
             if pkgs_filter:
                 user.filter_pkg_profile(pkgs_filter)
             len_profile = len(user.pkg_profile)
+            print "p",len_profile
         submission = data.PopconSubmission(path)
-        User.__init__(self,submission.packages,submission.user_id)
+        User.__init__(self,submission.packages,submission.user_id,submission.arch)
  
 class PopconSystem(User):
     def __init__(self,path,user_id=0):
@@ -296,7 +304,7 @@ class PopconSystem(User):
         submission = data.PopconSubmission(path)
         if not user_id:
             user_id = submission.user_id
-        User.__init__(self,submission.packages,user_id)
+        User.__init__(self,submission.packages,user_id,submission.arch)
  
 class PkgsListSystem(User):
     def __init__(self,pkgs_list_or_file,user_id=0):
@@ -36,7 +36,7 @@ button below.
 </div>
  
  
-<form action="/save" method="post" enctype="multipart/form-data" name="surveyform">
+<form action="save" method="post" enctype="multipart/form-data" name="surveyform">
  
 <input type="hidden" name="user_id" value=$request.user.user_id>
 <input type="hidden" name="strategy" value=$request.strategy>
...	...	@@ -37,7 +37,7 @@ if __name__ == '__main__':
37	37	#user = LocalSystem()
38	38	#user = RandomPopcon(cfg.popcon_dir)
39	39	#user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))
40		- user = PopconSystem("/home/tassia/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
	40	+ user = PopconSystem(os.path.expanduser("~/.app-recommender/popcon-entries/00/0001166d0737c6dffb083071e5ee69f5"))
41	41	user.filter_pkg_profile(os.path.join(cfg.filters_dir,"desktopapps"))
42	42	user.maximal_pkg_profile()
43	43	begin_time = datetime.datetime.now()
...	...	@@ -48,7 +48,7 @@ if __name__ == '__main__':
48	48	metrics.append(F_score(0.5))
49	49	metrics.append(Accuracy())
50	50	metrics.append(FPR())
51		- validation = CrossValidation(0.9,10,rec,metrics,1)
	51	+ validation = CrossValidation(0.9,20,rec,metrics,0.005)
52	52	validation.run(user)
53	53	print validation
54	54
...	...
...	...	@@ -0,0 +1,42 @@
	1	+#!/usr/bin/env python
	2	+"""
	3	+ AppRecommender - A GNU/Linux application recommender
	4	+"""
	5	+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
	6	+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
	7	+__license__ = """
	8	+ This program is free software: you can redistribute it and/or modify
	9	+ it under the terms of the GNU General Public License as published by
	10	+ the Free Software Foundation, either version 3 of the License, or
	11	+ (at your option) any later version.
	12	+
	13	+ This program is distributed in the hope that it will be useful,
	14	+ but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	16	+ GNU General Public License for more details.
	17	+
	18	+ You should have received a copy of the GNU General Public License
	19	+ along with this program. If not, see <http://www.gnu.org/licenses/>.
	20	+"""
	21	+
	22	+import os
	23	+import sys
	24	+sys.path.insert(0,'../')
	25	+import xapian
	26	+
	27	+if __name__ == '__main__':
	28	+ if len(sys.argv)<2:
	29	+ print "Usage: get_axipkgs index_path"
	30	+ exit(1)
	31	+
	32	+ axi_path = sys.argv[1]
	33	+ axi = xapian.Database(axi_path)
	34	+ for n in range(1,axi.get_lastdocid()):
	35	+ doc = 0
	36	+ try:
	37	+ doc = axi.get_document(n)
	38	+ except:
	39	+ pass
	40	+ if doc:
	41	+ xp_terms = [t.term for t in doc.termlist() if t.term.startswith("XP")]
	42	+ print xp_terms[0].lstrip('XP')
...	...
1	1	#!/usr/bin/env bash
2	2	#
3		-# get_desktop.sh - get packages which have desktop files
	3	+# get_desktop.sh - get packages which have desktop files
	4	+#
	5	+# DEPRECATED: use get_axipkgs.py to get this info from axi
4	6
5	7	cd /usr/share/app-install/desktop
6	8	sed -ne 's/X-AppInstall-Package=//p' * | sort -u | grep -v kdelibs | grep -v libfm-gtk0
...	...
1	1	#!/usr/bin/env python
2	2	#
3	3	# get_pkgs_inst.py - get tuple (package,installation) from popcon results file
	4	+#
	5	+# results_file: org/popcon.debian.org/popcon-mail/results
4	6
	7	+import sys
5	8	from operator import itemgetter
	9	+
6	10	if __name__ == '__main__':
	11	+ if len(sys.argv)<2:
	12	+ print "Usage: get_pkgs_inst popcon_results_path"
	13	+ exit(1)
	14	+
	15	+ results_path = sys.argv[1]
7	16	pkgs_inst = {}
8		- with open("/root/org/popcon.debian.org/popcon-mail/results") as results:
	17	+ with open(results_path) as results:
9	18	for line in results:
10	19	if line.startswith("Package"):
11	20	fields = line.split()
12	21	inst = int(fields[2])+int(fields[3])+int(fields[4])
13		- if inst > 20:
14		- pkgs_inst[fields[1]] = inst
	22	+ pkgs_inst[fields[1]] = inst
15	23	sorted_by_inst = sorted(pkgs_inst.items(), key=itemgetter(1))
16	24	for pkg, inst in sorted_by_inst:
17	25	print pkg, inst
...	...
...	...	@@ -0,0 +1,77 @@
	1	+#!/usr/bin/env python
	2	+"""
	3	+ indexer.py - generate xapian indexes to be used as items and users
	4	+ repositories
	5	+"""
	6	+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
	7	+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
	8	+__license__ = """
	9	+ This program is free software: you can redistribute it and/or modify
	10	+ it under the terms of the GNU General Public License as published by
	11	+ the Free Software Foundation, either version 3 of the License, or
	12	+ (at your option) any later version.
	13	+
	14	+ This program is distributed in the hope that it will be useful,
	15	+ but WITHOUT ANY WARRANTY; without even the implied warranty of
	16	+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	17	+ GNU General Public License for more details.
	18	+
	19	+ You should have received a copy of the GNU General Public License
	20	+ along with this program. If not, see <http://www.gnu.org/licenses/>.
	21	+"""
	22	+
	23	+import os
	24	+import sys
	25	+sys.path.insert(0,'../')
	26	+import datetime
	27	+
	28	+from config import Config
	29	+from error import Error
	30	+import data
	31	+import xapian
	32	+
	33	+if __name__ == '__main__':
	34	+ axi_path = "/var/lib/apt-xapian-index/index"
	35	+ axi = xapian.Database(axi_path)
	36	+ base_dir = os.path.expanduser("~/.app-recommender/")
	37	+
	38	+ begin_time = datetime.datetime.now()
	39	+
	40	+ # axi sample based on the pkgs sample provided by command line
	41	+ if "sample" in sys.argv:
	42	+ print ("Sample package indexing started at %s" % begin_time)
	43	+ if len(sys.argv) > 2:
	44	+ pkgs_filter = sys.argv[2]
	45	+ else:
	46	+ print "Usage: indexer axi_sample pkgs_sample_file"
	47	+ exit(1)
	48	+ with open(pkgs_filter) as valid:
	49	+ pkgs_list = [line.strip() for line in valid]
	50	+ filter_str = pkgs_filter.split("/")[-1]
	51	+ index = data.SampleAptXapianIndex(pkgs_list,axi,
	52	+ os.path.join(base_dir,"axi_"+filter_str))
	53	+ print ("Axi size: %d" % axi.get_doccount())
	54	+ print ("Packages list length: %d" % len(pkgs_list))
	55	+ print ("Sample index size: %d" %
	56	+ index.get_doccount())
	57	+
	58	+ # axi filtered by terms provided by command line
	59	+ if "filter" in sys.argv:
	60	+ print ("Filtered package indexing started at %s" % begin_time)
	61	+ if len(sys.argv) > 2:
	62	+ terms = sys.argv[2:]
	63	+ else:
	64	+ print ("Usage: indexer axi_filter term [additional terms]")
	65	+ exit(1)
	66	+ terms_str = "_".join([t.split("::")[-1] for t in terms])
	67	+ index = data.FilteredXapianIndex(terms,axi,
	68	+ os.path.join(base_dir,"axi_"+terms_str))
	69	+ print ("Axi size: %d" % axi.get_doccount())
	70	+ print ("Terms filter: %s" % terms)
	71	+ print ("Filtered index size: %d" %
	72	+ index.get_doccount())
	73	+
	74	+ end_time = datetime.datetime.now()
	75	+ print ("Indexing completed at %s" % end_time)
	76	+ delta = end_time - begin_time
	77	+ print ("Time elapsed: %d seconds." % delta.seconds)
...	...
...	...	@@ -0,0 +1,52 @@
	1	+#!/usr/bin/env python
	2	+"""
	3	+ popindex.py - generate a popcon index to be used by the recommender as the
	4	+ users repository, based on filters provided by config
	5	+"""
	6	+__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
	7	+__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
	8	+__license__ = """
	9	+ This program is free software: you can redistribute it and/or modify
	10	+ it under the terms of the GNU General Public License as published by
	11	+ the Free Software Foundation, either version 3 of the License, or
	12	+ (at your option) any later version.
	13	+
	14	+ This program is distributed in the hope that it will be useful,
	15	+ but WITHOUT ANY WARRANTY; without even the implied warranty of
	16	+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	17	+ GNU General Public License for more details.
	18	+
	19	+ You should have received a copy of the GNU General Public License
	20	+ along with this program. If not, see <http://www.gnu.org/licenses/>.
	21	+"""
	22	+import os
	23	+import sys
	24	+sys.path.insert(0,'../')
	25	+import logging
	26	+import datetime
	27	+
	28	+from config import Config
	29	+from data import FilteredPopconXapianIndex
	30	+
	31	+if __name__ == '__main__':
	32	+ base_dir = os.path.expanduser("~/.app-recommender/")
	33	+ axi_path = os.path.join(base_dir,"axi_XD")
	34	+ path = os.path.join(base_dir,"popcon_XD")
	35	+ popcon_dir = os.path.join(base_dir,"popcon-entries")
	36	+ tags_filter = os.path.join(base_dir,"filters/debtags")
	37	+
	38	+ # set up config for logging
	39	+ cfg = Config()
	40	+
	41	+ begin_time = datetime.datetime.now()
	42	+ logging.info("Popcon indexing started at %s" % begin_time)
	43	+ # use config file or command line options
	44	+ index = FilteredPopconXapianIndex(path,popcon_dir,axi_path,tags_filter)
	45	+
	46	+ end_time = datetime.datetime.now()
	47	+ logging.info("Popcon indexing completed at %s" % end_time)
	48	+ logging.info("Number of documents (submissions): %d" %
	49	+ index.get_doccount())
	50	+
	51	+ delta = end_time - begin_time
	52	+ logging.info("Time elapsed: %d seconds." % delta.seconds)
...	...
...	...	@@ -40,7 +40,7 @@ class Config(Singleton):
40	40	## general options
41	41	self.debug = 0
42	42	self.verbose = 1
43		- self.output = "log"
	43	+ self.output = "apprec.log"
44	44
45	45	## data_source options
46	46	self.base_dir = os.path.expanduser("~/.app-recommender/")
...	...	@@ -103,13 +103,14 @@ class Config(Singleton):
103	103	print " -f, --filtersdir=PATH Path to filters directory"
104	104	print " -b, --pkgsfilter=FILTER File containing packages to be considered for recommendations"
105	105	print " -a, --axi=PATH Path to apt-xapian-index"
106		- print " -e, --dde=URL DDE url"
107	106	print " -p, --popconindex=PATH Path to popcon index"
108		- print " -m, --popcondir=PATH Path to popcon submissions dir"
109		- print " -u, --indexmode=MODE 'old'|'reindex'|'cluster'|'recluster'"
110		- print " -l, --clustersdir=PATH Path to popcon clusters dir"
111		- print " -c, --medoids=k Number of medoids for clustering"
112		- print " -x, --maxpopcon=k Number of submissions to be considered"
	107	+ print " -e, --dde=URL DDE url"
	108	+ # deprecated options
	109	+ #print " -m, --popcondir=PATH Path to popcon submissions dir"
	110	+ #print " -u, --indexmode=MODE 'old'|'reindex'|'cluster'|'recluster'"
	111	+ #print " -l, --clustersdir=PATH Path to popcon clusters dir"
	112	+ #print " -c, --medoids=k Number of medoids for clustering"
	113	+ #print " -x, --maxpopcon=k Number of submissions to be considered"
113	114	print ""
114	115	print " [ recommender ]"
115	116	print " -w, --weight=OPTION Search weighting scheme"
...	...	@@ -123,11 +124,19 @@ class Config(Singleton):
123	124	print " bm25 = bm25 weighting scheme"
124	125	print ""
125	126	print " [ strategy options ] "
126		- print " cb = content-based "
127		- print " cbt = content-based using only tags as content "
128		- print " cbd = content-based using only package descriptions as content "
129		- print " col = collaborative "
130		- print " colct = collaborative through tags content "
	127	+ print " cb = content-based, mixed profile"
	128	+ print " cbt = content-based, tags only profile"
	129	+ print " cbd = content-based, description terms only profile"
	130	+ print " cbh = content-based, half-half profile"
	131	+ print " cb_eset = cb with eset profiling"
	132	+ print " cbt_eset = cbt with eset profiling"
	133	+ print " cbd_eset = cbd_eset with eset profiling"
	134	+ print " cbh_eset = cbh with eset profiling"
	135	+ print " knn = collaborative, tf-idf knn"
	136	+ print " knn_plus = collaborative, tf-idf weighted knn"
	137	+ print " knn_eset = collaborative, eset knn"
	138	+ print " knnco = collaborative through content"
	139	+ print " knnco_eset = collaborative through content, eset recommendation"
131	140
132	141	def read_option(self, section, option):
133	142	"""
...	...
...	...	@@ -30,12 +30,26 @@ import shutil
30	30	import apt
31	31	import re
32	32	import operator
	33	+import urllib
	34	+import simplejson as json
33	35
34	36	from error import Error
35	37	from singleton import Singleton
36	38	from dissimilarity import *
37	39	from config import Config
38	40
	41	+def axi_get_pkgs(axi):
	42	+ pkgs_names = []
	43	+ for docid in range(1,axi.get_lastdocid()+1):
	44	+ try:
	45	+ doc = axi.get_document(docid)
	46	+ except:
	47	+ pass
	48	+ docterms_XP = [t.term for t in doc.termlist()
	49	+ if t.term.startswith("XP")]
	50	+ pkgs_names.append(docterms_XP[0].lstrip('XP'))
	51	+ return pkgs_names
	52	+
39	53	def axi_search_pkgs(axi,pkgs_list):
40	54	terms = ["XP"+item for item in pkgs_list]
41	55	query = xapian.Query(xapian.Query.OP_OR, terms)
...	...	@@ -110,30 +124,39 @@ def tfidf_plus(index,docs,content_filter):
110	124	variance = sum([(p-mean)*(p-mean) for p in population])/len(population)
111	125	standard_deviation = math.sqrt(variance)
112	126	for d in docs:
113		- normalized_weigths[d.docid] = d.weight/standard_deviation
	127	+ if standard_deviation>1:
	128	+ # values between [0-1] would cause the opposite effect
	129	+ normalized_weigths[d.docid] = d.weight/standard_deviation
	130	+ else:
	131	+ normalized_weigths[d.docid] = d.weight
114	132	return tfidf_weighting(index,docs,content_filter,normalized_weigths)
115	133
116		-class AppAptXapianIndex(xapian.WritableDatabase):
	134	+class FilteredXapianIndex(xapian.WritableDatabase):
117	135	"""
118		- Data source for application packages information
	136	+ Filtered Xapian Index
119	137	"""
120		- def __init__(self,axi_path,path):
	138	+ def __init__(self,terms,index_path,path):
121	139	xapian.WritableDatabase.__init__(self,path,
122	140	xapian.DB_CREATE_OR_OVERWRITE)
123		- axi = xapian.Database(axi_path)
124		- logging.info("AptXapianIndex size: %d" % axi.get_doccount())
125		- for docid in range(1,axi.get_lastdocid()+1):
	141	+ index = xapian.Database(index_path)
	142	+ for docid in range(1,index.get_lastdocid()+1):
126	143	try:
127		- doc = axi.get_document(docid)
128		- allterms = [term.term for term in doc.termlist()]
129		- if "XTrole::program" in allterms:
	144	+ doc = index.get_document(docid)
	145	+ docterms = [term.term for term in doc.termlist()]
	146	+ tagged = False
	147	+ for t in terms:
	148	+ if t in docterms:
	149	+ tagged = True
	150	+ if tagged:
130	151	self.add_document(doc)
131	152	logging.info("Added doc %d." % docid)
132	153	else:
133	154	logging.info("Discarded doc %d." % docid)
134	155	except:
135	156	logging.info("Doc %d not found in axi." % docid)
136		- logging.info("AppAptXapianIndex size: %d (lastdocid: %d)." %
	157	+ logging.info("Filter: %s" % terms)
	158	+ logging.info("Index size: %d" % index.get_doccount())
	159	+ logging.info("Filtered Index size: %d (lastdocid: %d)." %
137	160	(self.get_doccount(), self.get_lastdocid()))
138	161
139	162	def __str__(self):
...	...	@@ -186,13 +209,13 @@ class DebianPackage():
186	209	if pkg_version.record.has_key('Conflicts'):
187	210	self.conflicts = pkg_version.record['Conflicts']
188	211	if pkg_version.record.has_key('Replaces'):
189		- self.conflicts = pkg_version.record['Replaces']
	212	+ self.replaces = pkg_version.record['Replaces']
190	213	if pkg_version.record.has_key('Provides'):
191	214	self.provides = pkg_version.record['Provides']
192	215
193	216	def load_details_from_dde(self,dde_server,dde_port):
194		- json_data = json.load(urllib.urlopen("http://%s:%s/q/udd/packages/all/%s?t=json"
195		- % dde_server,dde_port,self.name))
	217	+ json_data = json.load(urllib.urlopen("http://%s:%d/q/udd/packages/prio-debian-sid/%s?t=json"
	218	+ % (dde_server,dde_port,self.name)))
196	219
197	220	self.maintainer = json_data['r']['maintainer']
198	221	self.version = json_data['r']['version']
...	...	@@ -200,27 +223,27 @@ class DebianPackage():
200	223	self.description = self.format_description(json_data['r']['long_description'])
201	224	self.section = json_data['r']['section']
202	225	if json_data['r']['homepage']:
203		- self.conflicts = json_data['r']['homepage']
	226	+ self.homepage = json_data['r']['homepage']
204	227	if json_data['r']['tag']:
205	228	self.tags = self.debtags_list_to_dict(json_data['r']['tag'])
206	229	if json_data['r']['depends']:
207	230	self.depends = json_data['r']['depends']
208	231	if json_data['r']['pre_depends']:
209		- self.conflicts = json_data['r']['pre_depends']
	232	+ self.predepends = json_data['r']['pre_depends']
210	233	if json_data['r']['recommends']:
211		- self.conflicts = json_data['r']['recommends']
	234	+ self.recommends = json_data['r']['recommends']
212	235	if json_data['r']['suggests']:
213		- self.conflicts = json_data['r']['suggests']
	236	+ self.suggests = json_data['r']['suggests']
214	237	if json_data['r']['conflicts']:
215	238	self.conflicts = json_data['r']['conflicts']
216	239	if json_data['r']['replaces']:
217		- self.conflicts = json_data['r']['replaces']
	240	+ self.replaces = json_data['r']['replaces']
218	241	if json_data['r']['provides']:
219		- self.conflicts = json_data['r']['provides']
	242	+ self.provides = json_data['r']['provides']
220	243	self.popcon_insts = json_data['r']['popcon']['insts']
221	244
222	245	def format_description(self,description):
223		- return description.replace('.\n','').replace('\n','<br />')
	246	+ return description.replace(' .\n','<br />').replace('\n','<br />')
224	247
225	248	def debtags_str_to_dict(self, debtags_str):
226	249	debtags_list = [tag.rstrip(",") for tag in debtags_str.split()]
...	...	@@ -281,6 +304,7 @@ class PopconSubmission():
281	304	for line in submission:
282	305	if line.startswith("POPULARITY"):
283	306	self.user_id = line.split()[2].lstrip("ID:")
	307	+ self.arch = line.split()[3].lstrip("ARCH:")
284	308	elif not line.startswith("END-POPULARITY"):
285	309	data = line.rstrip('\n').split()
286	310	if len(data) > 2:
...	...	@@ -304,6 +328,82 @@ class PopconSubmission():
304	328	elif data[4] == '<RECENT-CTIME>':
305	329	self.packages[pkg] = 8
306	330
	331	+class FilteredPopconXapianIndex(xapian.WritableDatabase):
	332	+ """
	333	+ Data source for popcon submissions defined as a xapian database.
	334	+ """
	335	+ def __init__(self,path,popcon_dir,axi_path,tags_filter):
	336	+ """
	337	+ Set initial attributes.
	338	+ """
	339	+ self.axi = xapian.Database(axi_path)
	340	+ self.path = os.path.expanduser(path)
	341	+ self.popcon_dir = os.path.expanduser(popcon_dir)
	342	+ self.valid_pkgs = axi_get_pkgs(self.axi)
	343	+ logging.debug("Considering %d valid packages" % len(self.valid_pkgs))
	344	+ with open(tags_filter) as valid_tags:
	345	+ self.valid_tags = [line.strip() for line in valid_tags
	346	+ if not line.startswith("#")]
	347	+ logging.debug("Considering %d valid tags" % len(self.valid_tags))
	348	+ if not os.path.exists(self.popcon_dir):
	349	+ os.makedirs(self.popcon_dir)
	350	+ if not os.listdir(self.popcon_dir):
	351	+ logging.critical("Popcon dir seems to be empty.")
	352	+ raise Error
	353	+
	354	+ # set up directory
	355	+ shutil.rmtree(self.path,1)
	356	+ os.makedirs(self.path)
	357	+ try:
	358	+ logging.info("Indexing popcon submissions from \'%s\'" %
	359	+ self.popcon_dir)
	360	+ logging.info("Creating new xapian index at \'%s\'" %
	361	+ self.path)
	362	+ xapian.WritableDatabase.__init__(self,self.path,
	363	+ xapian.DB_CREATE_OR_OVERWRITE)
	364	+ except xapian.DatabaseError as e:
	365	+ logging.critical("Could not create popcon xapian index.")
	366	+ logging.critical(str(e))
	367	+ raise Error
	368	+
	369	+ # build new index
	370	+ doc_count = 0
	371	+ for root, dirs, files in os.walk(self.popcon_dir):
	372	+ for popcon_file in files:
	373	+ submission = PopconSubmission(os.path.join(root, popcon_file))
	374	+ doc = xapian.Document()
	375	+ submission_pkgs = submission.get_filtered(self.valid_pkgs)
	376	+ if len(submission_pkgs) < 10:
	377	+ logging.debug("Low profile popcon submission \'%s\' (%d)" %
	378	+ (submission.user_id,len(submission_pkgs)))
	379	+ else:
	380	+ doc.set_data(submission.user_id)
	381	+ doc.add_term("ID"+submission.user_id)
	382	+ doc.add_term("ARCH"+submission.arch)
	383	+ logging.debug("Parsing popcon submission \'%s\'" %
	384	+ submission.user_id)
	385	+ for pkg,freq in submission_pkgs.items():
	386	+ tags = axi_search_pkg_tags(self.axi,pkg)
	387	+ # if the package was found in axi
	388	+ if tags:
	389	+ doc.add_term("XP"+pkg,freq)
	390	+ # if the package has tags associated with it
	391	+ if not tags == "notags":
	392	+ for tag in tags:
	393	+ if tag.lstrip("XT") in self.valid_tags:
	394	+ doc.add_term(tag,freq)
	395	+ doc_id = self.add_document(doc)
	396	+ doc_count += 1
	397	+ logging.debug("Popcon Xapian: Indexing doc %d" % doc_id)
	398	+ # python garbage collector
	399	+ gc.collect()
	400	+ # flush to disk database changes
	401	+ try:
	402	+ self.commit()
	403	+ except:
	404	+ self.flush() # deprecated function, used for compatibility with old lib version
	405	+
	406	+# Deprecated class, must be reviewed
307	407	class PopconXapianIndex(xapian.WritableDatabase):
308	408	"""
309	409	Data source for popcon submissions defined as a singleton xapian database.
...	...
...	...	@@ -140,6 +140,29 @@ class FPR(Metric):
140	140	return (float(len(evaluation.false_positive))/
141	141	evaluation.real_negative_len)
142	142
	143	+class MCC(Metric):
	144	+ """
	145	+ Matthews correlation coefficient.
	146	+ """
	147	+ def __init__(self):
	148	+ """
	149	+ Set metric description.
	150	+ """
	151	+ self.desc = " MCC "
	152	+
	153	+ def run(self,evaluation):
	154	+ """
	155	+ Compute metric.
	156	+ """
	157	+ VP = len(evaluation.true_positive)
	158	+ FP = len(evaluation.false_positive)
	159	+ FN = len(evaluation.false_negative)
	160	+ VN = evaluation.true_negative_len
	161	+ if (VP+FP)==0 or (VP+FN)==0 or (VN+FP)==0 or (VN+FN)==0:
	162	+ return 0
	163	+ MCC = (((VPVN)-(FPFN))/math.sqrt((VP+FP)(VP+FN)(VN+FP)*(VN+FN)))
	164	+ return MCC
	165	+
143	166	class F_score(Metric):
144	167	"""
145	168	Classification accuracy metric which correlates precision and recall into an
...	...
...	...	@@ -0,0 +1,2 @@
	1	+Experiments handled by expsuite:
	2	+https://github.com/rueckstiess/expsuite
...	...