Commit 11fc8731a1246b3f06a729395cdfefcc9bc0322c
Exists in
master
and in
1 other branch
Merging content of survey index.
Showing
8 changed files
with
69 additions
and
37 deletions
Show diff stats
src/config.py
... | ... | @@ -46,6 +46,7 @@ class Config(): |
46 | 46 | self.popcon_dir = os.path.expanduser("~/.app-recommender/popcon_dir") |
47 | 47 | self.clusters_dir = os.path.expanduser("~/.app-recommender/clusters_dir") |
48 | 48 | self.k_medoids = 100 |
49 | + self.max_popcon = 1000 | |
49 | 50 | self.index_mode = "old" |
50 | 51 | self.strategy = "cb" |
51 | 52 | self.weight = "bm25" |
... | ... | @@ -71,6 +72,7 @@ class Config(): |
71 | 72 | print " -u, --indexmode= 'old'|'reindex'|'cluster'|'recluster'" |
72 | 73 | print " -l, --clustersdir=PATH Path to popcon clusters dir" |
73 | 74 | print " -c, --medoids=k Number of medoids for clustering" |
75 | + print " -x, --maxpopcon=k Number of submissions to be considered" | |
74 | 76 | print "" |
75 | 77 | print " [ recommender ]" |
76 | 78 | print " -w, --weight=OPTION Search weighting scheme" |
... | ... | @@ -112,8 +114,8 @@ class Config(): |
112 | 114 | logging.error("Error in config file syntax: %s", str(err)) |
113 | 115 | os.abort() |
114 | 116 | |
115 | - self.debug = self.read_option('general', 'debug') | |
116 | - self.debug = self.read_option('general', 'verbose') | |
117 | + self.debug = int(self.read_option('general', 'debug')) | |
118 | + self.debug = int(self.read_option('general', 'verbose')) | |
117 | 119 | self.output_filename = self.read_option('general', 'output') |
118 | 120 | self.survey_mode = self.read_option('general', 'survey_mode') |
119 | 121 | |
... | ... | @@ -123,16 +125,18 @@ class Config(): |
123 | 125 | self.popcon_dir = os.path.expanduser(self.read_option('data_sources', 'popcon_dir')) |
124 | 126 | self.index_mode = self.read_option('data_sources', 'index_mode') |
125 | 127 | self.clusters_dir = os.path.expanduser(self.read_option('data_sources', 'clusters_dir')) |
126 | - self.k_medoids = self.read_option('data_sources', 'k_medoids') | |
128 | + self.k_medoids = int(self.read_option('data_sources', 'k_medoids')) | |
129 | + self.max_popcon = int(self.read_option('data_sources', 'max_popcon')) | |
127 | 130 | |
128 | 131 | self.weight = self.read_option('recommender', 'weight') |
129 | 132 | self.strategy = self.read_option('recommender', 'strategy') |
130 | - self.profile_size = self.read_option('recommender', 'profile_size') | |
133 | + self.profile_size = int(self.read_option('recommender', | |
134 | + 'profile_size')) | |
131 | 135 | |
132 | - short_options = "hdvo:a:e:p:m:ul:c:w:s:z:" | |
136 | + short_options = "hdvo:a:e:p:m:ul:c:x:w:s:z:" | |
133 | 137 | long_options = ["help", "debug", "verbose", "output=", |
134 | 138 | "axi=", "dde=", "popconindex=", "popcondir=", "indexmode=", |
135 | - "clustersdir=", "kmedoids=", "weight=", "strategy=", | |
139 | + "clustersdir=", "kmedoids=", "max_popcon=", "weight=", "strategy=", | |
136 | 140 | "profile_size="] |
137 | 141 | try: |
138 | 142 | opts, args = getopt.getopt(sys.argv[1:], short_options, |
... | ... | @@ -166,13 +170,15 @@ class Config(): |
166 | 170 | elif o in ("-l", "--clustersdir"): |
167 | 171 | self.clusters_dir = p |
168 | 172 | elif o in ("-c", "--kmedoids"): |
169 | - self.k_medoids = p | |
173 | + self.k_medoids = int(p) | |
174 | + elif o in ("-x", "--max_popcon"): | |
175 | + self.max_popcon = int(p) | |
170 | 176 | elif o in ("-w", "--weight"): |
171 | 177 | self.weight = p |
172 | 178 | elif o in ("-s", "--strategy"): |
173 | 179 | self.strategy = p |
174 | 180 | elif o in ("-z", "--profile_size"): |
175 | - self.strategy = p | |
181 | + self.strategy = int(p) | |
176 | 182 | else: |
177 | 183 | assert False, "unhandled option" |
178 | 184 | ... | ... |
src/data.py
... | ... | @@ -82,7 +82,7 @@ class AppAptXapianIndex(xapian.WritableDatabase): |
82 | 82 | except: |
83 | 83 | logging.info("Doc %d not found in axi." % docid) |
84 | 84 | logging.info("AppAptXapianIndex size: %d (lastdocid: %d)." % |
85 | - self.get_doccount(), self.get_lastdocid()) | |
85 | + (self.get_doccount(), self.get_lastdocid())) | |
86 | 86 | |
87 | 87 | def __str__(self): |
88 | 88 | return print_index(self) |
... | ... | @@ -166,6 +166,7 @@ class PopconXapianIndex(xapian.WritableDatabase): |
166 | 166 | raise Error |
167 | 167 | if cfg.index_mode == "reindex": |
168 | 168 | self.source_dir = os.path.expanduser(cfg.popcon_dir) |
169 | + logging.debug(self.source_dir) | |
169 | 170 | else: |
170 | 171 | self.source_dir = os.path.expanduser(cfg.clusters_dir) |
171 | 172 | if not os.path.exists(cfg.clusters_dir): |
... | ... | @@ -180,10 +181,12 @@ class PopconXapianIndex(xapian.WritableDatabase): |
180 | 181 | % cfg.clusters_dir) |
181 | 182 | distance = JaccardDistance() |
182 | 183 | data = self.get_submissions(cfg.popcon_dir) |
184 | + logging.debug(type(data)) | |
183 | 185 | self.cluster_dispersion = \ |
184 | 186 | self.kmedoids_clustering(data, cfg.clusters_dir, |
185 | - distance, cfg.k_medoids) | |
186 | - logging.info("Clusters dispersion: %f.2", | |
187 | + distance, cfg.k_medoids, | |
188 | + cfg.max_popcon) | |
189 | + logging.info("Clusters dispersion: %.2f", | |
187 | 190 | self.cluster_dispersion) |
188 | 191 | else: |
189 | 192 | logging.info("Using clusters from \'%s\'" % |
... | ... | @@ -221,8 +224,9 @@ class PopconXapianIndex(xapian.WritableDatabase): |
221 | 224 | self.path) |
222 | 225 | xapian.WritableDatabase.__init__(self,self.path, |
223 | 226 | xapian.DB_CREATE_OR_OVERWRITE) |
224 | - except xapian.DatabaseError: | |
227 | + except xapian.DatabaseError as e: | |
225 | 228 | logging.critical("Could not create popcon xapian index.") |
229 | + logging.critical(str(e)) | |
226 | 230 | raise Error |
227 | 231 | |
228 | 232 | for root, dirs, files in os.walk(self.source_dir): |
... | ... | @@ -254,29 +258,32 @@ class PopconXapianIndex(xapian.WritableDatabase): |
254 | 258 | submissions = [] |
255 | 259 | for root, dirs, files in os.walk(submissions_dir): |
256 | 260 | for popcon_file in files: |
261 | + logging.debug("Parsing submission %s" % popcon_file) | |
257 | 262 | submission = PopconSubmission(os.path.join(root, popcon_file)) |
258 | 263 | submissions.append(submission) |
259 | 264 | return submissions |
260 | 265 | |
261 | - def kmedoids_clustering(self,data,clusters_dir,distance,k_medoids): | |
266 | + def kmedoids_clustering(self,data,clusters_dir,distance,k_medoids,max_popcon): | |
262 | 267 | clusters = KMedoidsClustering(data,lambda x,y: |
263 | 268 | distance(x.packages.keys(), |
264 | - y.packages.keys())) | |
269 | + y.packages.keys()),max_popcon) | |
265 | 270 | medoids,dispersion = clusters.getMedoids(k_medoids) |
266 | 271 | for submission in medoids: |
272 | + logging.debug("Copying submission %s" % submission.user_id) | |
267 | 273 | shutil.copyfile(submission.path,os.path.join(clusters_dir, |
268 | 274 | submission.user_id)) |
269 | 275 | return dispersion |
270 | 276 | |
271 | 277 | class KMedoidsClustering(cluster.KMeansClustering): |
272 | 278 | |
273 | - def __init__(self,data,distance,max_data=100): | |
274 | - # if len(data)<max_data: | |
275 | - # data_sample = data | |
276 | - # else: | |
277 | - # data_sample = random.sample(data,max_data) | |
278 | - # cluster.KMeansClustering.__init__(self, data_sample, distance) | |
279 | - cluster.KMeansClustering.__init__(self, data, distance) | |
279 | + def __init__(self,data,distance,max_data): | |
280 | + if len(data)<max_data: | |
281 | + data_sample = data | |
282 | + else: | |
283 | + data_sample = random.sample(data,max_data) | |
284 | + print data_sample | |
285 | + cluster.KMeansClustering.__init__(self, data_sample, distance) | |
286 | + # cluster.KMeansClustering.__init__(self, data, distance) | |
280 | 287 | self.distanceMatrix = {} |
281 | 288 | for submission in self._KMeansClustering__data: |
282 | 289 | self.distanceMatrix[submission.user_id] = {} |
... | ... | @@ -335,6 +342,8 @@ class KMedoidsClustering(cluster.KMeansClustering): |
335 | 342 | """ |
336 | 343 | #medoids_distances = [self.getMedoid(cluster) for cluster in self.getclusters(n)] |
337 | 344 | medoids_distances = [] |
345 | + logging.debug("initial length %s" % self._KMeansClustering__initial_length) | |
346 | + logging.debug("n %d" % n) | |
338 | 347 | for cluster in self.getclusters(n): |
339 | 348 | type(cluster) |
340 | 349 | print cluster | ... | ... |
src/examples/cross_validation.py
... | ... | @@ -53,7 +53,7 @@ if __name__ == '__main__': |
53 | 53 | metrics.append(F1()) |
54 | 54 | metrics.append(Accuracy()) |
55 | 55 | metrics.append(SimpleAccuracy()) |
56 | - validation = CrossValidation(0.3,10,rec,metrics,0.005) | |
56 | + validation = CrossValidation(0.9,10,rec,metrics,0.1) | |
57 | 57 | validation.run(user) |
58 | 58 | print validation |
59 | 59 | ... | ... |
src/experiments/experiments.cfg
... | ... | @@ -5,7 +5,8 @@ path = 'results' |
5 | 5 | experiment = 'grid' |
6 | 6 | weight = ['bm25', 'trad'] |
7 | 7 | ;profile_size = range(10,100,10) |
8 | -sample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] | |
8 | +;sample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] | |
9 | +sample = [0.6, 0.7, 0.8, 0.9] | |
9 | 10 | |
10 | 11 | [content] |
11 | 12 | strategy = ['cb','cbt','cbd'] | ... | ... |
src/web/templates/about.html
... | ... | @@ -15,20 +15,32 @@ $var jsfiles: static/js/facebox.js |
15 | 15 | <div id="maincontent"> |
16 | 16 | <div class="innertube"> |
17 | 17 | |
18 | -<a rel="facebox" href="/static/images/diaappr.png" title="AppRecommender Diagram"><img style="float: right; margin: 10px;" alt="AppRecommender Diagram" src="/static/images/diaappr.png" width="230px" /></a> | |
18 | +<h1>AppRecommender Survey</h1> | |
19 | +<h2>About</h2> | |
19 | 20 | |
20 | -<h1>About</h1> | |
21 | - | |
22 | -<p>This experiment aims to compare and validate automated application | |
23 | -recommendations produced by various strategies and algorithms tunnings. We | |
21 | +<p align="justify">This experiment aims to compare and validate automated application | |
22 | +recommendations produced by various strategies and algorithms tuning. We | |
24 | 23 | believe that real users' evaluation regarding the relevance of recommendations is |
25 | 24 | the most accurate data source for computing recommender system effectiveness.</p> |
26 | -<br /> | |
27 | -<p>The engine that is being tested is a free software called <a | |
25 | +<br/> | |
26 | + | |
27 | +<a rel="facebox" href="/static/images/diaappr.png" title="AppRecommender Diagram"> | |
28 | +<img style="float: right; margin: 10px;" alt="AppRecommender dataflow" | |
29 | +src="/static/images/diaappr.png" width="230px" /></a> | |
30 | + | |
31 | + <p align="justify">The engine that is being tested is free software called <a | |
28 | 32 | href="http://github.com/tassia/AppRecommender">AppRecommender</a>. It was |
29 | 33 | initially developed using the Debian Project infrastructure, but the solution |
30 | -is essentially distro-independent and could even be adapted to non GNU/Linux | |
31 | -systems given that there was available data for that.</p> | |
34 | +is essentially distro-independent and can even be adapted to non GNU/Linux | |
35 | +systems given that there is available data for that.</p> | |
36 | +<br /> | |
37 | + | |
38 | +<p align="justify">The picture on the right gives an idea of the data workflow | |
39 | +for AppRecommender. The user provides a set of applications installed in his | |
40 | + system and the recommender suggests a set of applications that he might also | |
41 | +be interested in. Different strategies can be used to compose the recommendation, | |
42 | + based on this user's and other similar users' profiles, using Apt-xapian-index, | |
43 | +Popcon and UDD as data sources.</p> | |
32 | 44 | |
33 | 45 | </div><!-- id="innertube" --> |
34 | 46 | </div><!-- id="maincontent" --> | ... | ... |
src/web/templates/layout.html
1 | 1 | $def with (content) |
2 | -$ url_base = "http://localhost:8080" | |
2 | +$ url_base = "/" | |
3 | 3 | <!--Force IE6 into quirks mode with this comment tag--> |
4 | 4 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" |
5 | 5 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> |
... | ... | @@ -162,7 +162,7 @@ $:content |
162 | 162 | <div id="navbar"> |
163 | 163 | <ul> |
164 | 164 | <li><a href="$url_base">Home</a></li> |
165 | - <li><a href="$url_base/about">About</a></li> | |
165 | + <li><a href="$(url_base)about">About</a></li> | |
166 | 166 | <li><a href="http://github.com/tassia/AppRecommender">Development</a></li> |
167 | 167 | </ul> |
168 | 168 | </div><!-- id="navbar" --> | ... | ... |
src/web/templates/survey.html
... | ... | @@ -28,7 +28,11 @@ $var jsfiles: static/coda-slider-2.0/javascripts/jquery-1.3.2.min.js static/coda |
28 | 28 | <div class="show-end" style="display: none;"> |
29 | 29 | |
30 | 30 | <p> |
31 | -A class <b>show-end</b> ou <b>hide-end</b> mostra um elemento ou esconde um elemento ao final do último Next. | |
31 | +<!--A class <b>show-end</b> ou <b>hide-end</b> mostra um elemento ou esconde um | |
32 | +elemento ao final do último Next.--> | |
33 | +You have completed this round of evaluations. If you have time to do some more, | |
34 | + please do so. Otherwise, please conclude your participation by clicking the | |
35 | +button below. | |
32 | 36 | </p> |
33 | 37 | |
34 | 38 | </div> | ... | ... |
src/web/templates/survey_index.html
... | ... | @@ -28,8 +28,8 @@ field of each line. For instance, you can run the following command and upload t |
28 | 28 | generated 'packages.list' file.</p> |
29 | 29 | <p><code> # dpkg-query --show > packages.list </code></p> |
30 | 30 | <p>Given the produced recommendations you will be asked to evaluate the list of |
31 | -applications suggested. You need to analyse at least 10 sugestions to be considered | |
32 | -in the survey, though we appreciate if you do as many as you can.</p> | |
31 | + applications suggested. You need to analyse at least 10 suggestions to be | |
32 | +considered in the survey, though we appreciate if you do as many as you can.</p> | |
33 | 33 | <br /> |
34 | 34 | <p>Your help is very much appreciated!</p> |
35 | 35 | </div> | ... | ... |