Merging content of survey index.

Tássia Camões Araújo
2 parents 5075d5c0 1e970ec5
Showing 8 changed files with 69 additions and 37 deletions Show diff stats
src/config.py
src/data.py
src/examples/cross_validation.py
src/experiments/experiments.cfg
src/web/templates/about.html
src/web/templates/layout.html
src/web/templates/survey.html
src/web/templates/survey_index.html
@@ -46,6 +46,7 @@ class Config():
         self.popcon_dir = os.path.expanduser("~/.app-recommender/popcon_dir")
         self.clusters_dir = os.path.expanduser("~/.app-recommender/clusters_dir")
         self.k_medoids = 100
+        self.max_popcon = 1000
         self.index_mode = "old"
         self.strategy = "cb"
         self.weight = "bm25"
@@ -71,6 +72,7 @@ class Config():
         print "  -u, --indexmode=           'old'|'reindex'|'cluster'|'recluster'"
         print "  -l, --clustersdir=PATH     Path to popcon clusters dir"
         print "  -c, --medoids=k            Number of medoids for clustering"
+        print "  -x, --maxpopcon=k          Number of submissions to be considered"
         print ""
         print " [ recommender ]"
         print "  -w, --weight=OPTION        Search weighting scheme"
@@ -112,8 +114,8 @@ class Config():
             logging.error("Error in config file syntax: %s", str(err))
             os.abort()
-        self.debug = self.read_option('general', 'debug')
-        self.debug = self.read_option('general', 'verbose')
+        self.debug = int(self.read_option('general', 'debug'))
+        self.debug = int(self.read_option('general', 'verbose'))
         self.output_filename = self.read_option('general', 'output')
         self.survey_mode = self.read_option('general', 'survey_mode')
@@ -123,16 +125,18 @@ class Config():
         self.popcon_dir = os.path.expanduser(self.read_option('data_sources', 'popcon_dir'))
         self.index_mode = self.read_option('data_sources', 'index_mode')
         self.clusters_dir = os.path.expanduser(self.read_option('data_sources', 'clusters_dir'))
-        self.k_medoids = self.read_option('data_sources', 'k_medoids')
+        self.k_medoids = int(self.read_option('data_sources', 'k_medoids'))
+        self.max_popcon = int(self.read_option('data_sources', 'max_popcon'))
         self.weight = self.read_option('recommender', 'weight')
         self.strategy = self.read_option('recommender', 'strategy')
-        self.profile_size = self.read_option('recommender', 'profile_size')
+        self.profile_size = int(self.read_option('recommender',
+                                                 'profile_size'))
-        short_options = "hdvo:a:e:p:m:ul:c:w:s:z:"
+        short_options = "hdvo:a:e:p:m:ul:c:x:w:s:z:"
         long_options = ["help", "debug", "verbose", "output=",
                         "axi=", "dde=", "popconindex=", "popcondir=", "indexmode=",
-                        "clustersdir=", "kmedoids=", "weight=", "strategy=",
+                        "clustersdir=", "kmedoids=", "max_popcon=", "weight=", "strategy=",
                         "profile_size="]
         try:
             opts, args = getopt.getopt(sys.argv[1:], short_options,
@@ -166,13 +170,15 @@ class Config():
             elif o in ("-l", "--clustersdir"):
                 self.clusters_dir = p
             elif o in ("-c", "--kmedoids"):
-                self.k_medoids = p
+                self.k_medoids = int(p)
+            elif o in ("-x", "--max_popcon"):
+                self.max_popcon = int(p)
             elif o in ("-w", "--weight"):
                 self.weight = p
             elif o in ("-s", "--strategy"):
                 self.strategy = p
             elif o in ("-z", "--profile_size"):
-                self.strategy = p
+                self.strategy = int(p)
             else:
                 assert False, "unhandled option"
@@ -82,7 +82,7 @@ class AppAptXapianIndex(xapian.WritableDatabase):
             except:
                 logging.info("Doc %d not found in axi." % docid)
         logging.info("AppAptXapianIndex size: %d (lastdocid: %d)." %
-                     self.get_doccount(), self.get_lastdocid())
+                     (self.get_doccount(), self.get_lastdocid()))
     def __str__(self):
         return print_index(self)
@@ -166,6 +166,7 @@ class PopconXapianIndex(xapian.WritableDatabase):
                 raise Error
             if cfg.index_mode == "reindex":
                 self.source_dir = os.path.expanduser(cfg.popcon_dir)
+                logging.debug(self.source_dir)
             else:
                 self.source_dir = os.path.expanduser(cfg.clusters_dir)
                 if not os.path.exists(cfg.clusters_dir):
@@ -180,10 +181,12 @@ class PopconXapianIndex(xapian.WritableDatabase):
                                  % cfg.clusters_dir)
                     distance = JaccardDistance()
                     data = self.get_submissions(cfg.popcon_dir)
+                    logging.debug(type(data))
                     self.cluster_dispersion = \
                         self.kmedoids_clustering(data, cfg.clusters_dir,
-                                                 distance, cfg.k_medoids)
-                    logging.info("Clusters dispersion: %f.2",
+                                                 distance, cfg.k_medoids,
+                                                 cfg.max_popcon)
+                    logging.info("Clusters dispersion: %.2f",
                                  self.cluster_dispersion)
                 else:
                     logging.info("Using clusters from \'%s\'" %
@@ -221,8 +224,9 @@ class PopconXapianIndex(xapian.WritableDatabase):
                          self.path)
             xapian.WritableDatabase.__init__(self,self.path,
                                              xapian.DB_CREATE_OR_OVERWRITE)
-        except xapian.DatabaseError:
+        except xapian.DatabaseError as e:
             logging.critical("Could not create popcon xapian index.")
+            logging.critical(str(e))
             raise Error
         for root, dirs, files in os.walk(self.source_dir):
@@ -254,29 +258,32 @@ class PopconXapianIndex(xapian.WritableDatabase):
         submissions = []
         for root, dirs, files in os.walk(submissions_dir):
             for popcon_file in files:
+                logging.debug("Parsing submission %s" % popcon_file)
                 submission = PopconSubmission(os.path.join(root, popcon_file))
                 submissions.append(submission)
         return submissions
-    def kmedoids_clustering(self,data,clusters_dir,distance,k_medoids):
+    def kmedoids_clustering(self,data,clusters_dir,distance,k_medoids,max_popcon):
         clusters = KMedoidsClustering(data,lambda x,y:
                                            distance(x.packages.keys(),
-                                                    y.packages.keys()))
+                                                    y.packages.keys()),max_popcon)
         medoids,dispersion = clusters.getMedoids(k_medoids)
         for submission in medoids:
+            logging.debug("Copying submission %s" % submission.user_id)
             shutil.copyfile(submission.path,os.path.join(clusters_dir,
                                                          submission.user_id))
         return dispersion
 class KMedoidsClustering(cluster.KMeansClustering):
-    def __init__(self,data,distance,max_data=100):
-       # if len(data)<max_data:
-       #     data_sample = data
-       # else:
-       #     data_sample = random.sample(data,max_data)
-       # cluster.KMeansClustering.__init__(self, data_sample, distance)
-        cluster.KMeansClustering.__init__(self, data, distance)
+    def __init__(self,data,distance,max_data):
+        if len(data)<max_data:
+            data_sample = data
+        else:
+            data_sample = random.sample(data,max_data)
+        print data_sample
+        cluster.KMeansClustering.__init__(self, data_sample, distance)
+       # cluster.KMeansClustering.__init__(self, data, distance)
         self.distanceMatrix = {}
         for submission in self._KMeansClustering__data:
             self.distanceMatrix[submission.user_id] = {}
@@ -335,6 +342,8 @@ class KMedoidsClustering(cluster.KMeansClustering):
         """
         #medoids_distances = [self.getMedoid(cluster) for cluster in self.getclusters(n)]
         medoids_distances = []
+        logging.debug("initial length %s" % self._KMeansClustering__initial_length)
+        logging.debug("n %d" % n)
         for cluster in self.getclusters(n):
             type(cluster)
             print cluster
@@ -53,7 +53,7 @@ if __name__ == '__main__':
         metrics.append(F1())
         metrics.append(Accuracy())
         metrics.append(SimpleAccuracy())
-        validation = CrossValidation(0.3,10,rec,metrics,0.005)
+        validation = CrossValidation(0.9,10,rec,metrics,0.1)
         validation.run(user)
         print validation
@@ -5,7 +5,8 @@ path = 'results'
 experiment = 'grid'
 weight = ['bm25', 'trad']
 ;profile_size = range(10,100,10)
-sample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
+;sample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
+sample = [0.6, 0.7, 0.8, 0.9]
 [content]
 strategy = ['cb','cbt','cbd']
@@ -15,20 +15,32 @@ $var jsfiles: static/js/facebox.js
 <div id="maincontent">
 <div class="innertube">
-<a rel="facebox" href="/static/images/diaappr.png" title="AppRecommender Diagram"><img style="float: right; margin: 10px;" alt="AppRecommender Diagram" src="/static/images/diaappr.png" width="230px" /></a>
+<h1>AppRecommender Survey</h1>
+<h2>About</h2>
-<h1>About</h1>
-
-<p>This experiment aims to compare and validate automated application
-recommendations produced by various strategies and algorithms tunnings. We
+<p align="justify">This experiment aims to compare and validate automated application
+recommendations produced by various strategies and algorithm tunings. We
 believe that real users evaluation regarding the relevance of recommendations is
 the most accurate data source for computing recommender system effectiveness.</p>
-<br />
-<p>The engine that is being tested is a free software called <a
+<br/>
+
+<a rel="facebox" href="/static/images/diaappr.png" title="AppRecommender Diagram">
+<img style="float: right; margin: 10px;" alt="AppRecommender dataflow"
+src="/static/images/diaappr.png" width="230px" /></a>
+
+<p align="justify">The engine that is being tested is free software called <a
 href="http://github.com/tassia/AppRecommender">AppRecommender</a>. It was
 initially developed using the Debian Project infrastructure, but the solution
-is essentially distro-independent and could even be adapted to non GNU/Linux
-systems given that there was available data for that.</p>
+is essentially distro-independent and can even be adapted to non GNU/Linux
+systems given that there is available data for that.</p>
+<br />
+
+<p align="justify">The picture on the right gives an idea of the data workflow
+for AppRecommender. The user provides a set of applications installed in his
+system and the recommender suggests a set of applications that he might also
+be interested in. Different strategies can be used to compose the recommendation,
+based on this user's and other similar users' profiles, using Apt-xapian-index,
+Popcon and UDD as data sources.</p>
 </div><!-- id="innertube" -->
 </div><!-- id="maincontent" -->
 $def with (content)
-$ url_base = "http://localhost:8080"
+$ url_base = "/"
 <!--Force IE6 into quirks mode with this comment tag-->
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
@@ -162,7 +162,7 @@ $:content
   <div id="navbar">
      <ul>
 	<li><a href="$url_base">Home</a></li>
-	<li><a href="$url_base/about">About</a></li>
+	<li><a href="$(url_base)about">About</a></li>
 	<li><a href="http://github.com/tassia/AppRecommender">Development</a></li>
     </ul>
    </div><!-- id="navbar" -->
@@ -28,7 +28,11 @@ $var jsfiles: static/coda-slider-2.0/javascripts/jquery-1.3.2.min.js static/coda
 <div class="show-end" style="display: none;">
 <p>
-A class <b>show-end</b> ou <b>hide-end</b> mostra um elemento ou esconde um elemento ao final do último Next.
+<!--A class <b>show-end</b> ou <b>hide-end</b> mostra um elemento ou esconde um
+elemento ao final do último Next.-->
+You have completed this round of evaluations. If you have time to do some more,
+please do so. Otherwise, please conclude your participation by clicking the
+button below.
 </p>
 </div>
@@ -28,8 +28,8 @@ field of each line. For instance, you can run the following command and upload t
 generated 'packages.list' file.</p>
 <p><code> # dpkg-query --show > packages.list </code></p>
 <p>Given the produced recommendations you will be asked to evaluate the list of
-applications suggested. You need to analyse at least 10 sugestions to be considered
-in the survey, though we appreciate if you do as many as you can.</p>
+applications suggested. You need to analyse at least 10 suggestions to be
+considered in the survey, though we appreciate if you do as many as you can.</p>
 <br />
 <p>Your help is very much appreciated!</p>
 </div>
	@@ -53,7 +53,7 @@ if __name__ == '__main__':		@@ -53,7 +53,7 @@ if __name__ == '__main__':
53	metrics.append(F1())	53	metrics.append(F1())
54	metrics.append(Accuracy())	54	metrics.append(Accuracy())
55	metrics.append(SimpleAccuracy())	55	metrics.append(SimpleAccuracy())
56	- validation = CrossValidation(0.3,10,rec,metrics,0.005)	56	+ validation = CrossValidation(0.9,10,rec,metrics,0.1)
57	validation.run(user)	57	validation.run(user)
58	print validation	58	print validation
59		59
1	$def with (content)	1	$def with (content)
2	-$ url_base = "http://localhost:8080"	2	+$ url_base = "/"
3	<!--Force IE6 into quirks mode with this comment tag-->	3	<!--Force IE6 into quirks mode with this comment tag-->
4	<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"	4	<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
5	"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">	5	"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
	@@ -162,7 +162,7 @@ $:content		@@ -162,7 +162,7 @@ $:content
162	<div id="navbar">	162	<div id="navbar">
163	<ul>	163	<ul>
164	<li><a href="$url_base">Home</a></li>	164	<li><a href="$url_base">Home</a></li>
165	- <li><a href="$url_base/about">About</a></li>	165	+ <li><a href="$(url_base)about">About</a></li>
166	<li><a href="http://github.com/tassia/AppRecommender">Development</a></li>	166	<li><a href="http://github.com/tassia/AppRecommender">Development</a></li>
167	</ul>	167	</ul>
168	</div><!-- id="navbar" -->	168	</div><!-- id="navbar" -->