Commit e2abc449dea04bc05cdbccd8602bb3dbe4a15bf8
1 parent
91384b32
Exists in
master
and in
1 other branch
Added arch info to popcon and fixed standard deviation bug.
Showing
1 changed file
with
8 additions
and
1 deletion
Show diff stats
src/data.py
... | ... | @@ -122,7 +122,11 @@ def tfidf_plus(index,docs,content_filter): |
122 | 122 | variance = sum([(p-mean)*(p-mean) for p in population])/len(population) |
123 | 123 | standard_deviation = math.sqrt(variance) |
124 | 124 | for d in docs: |
125 | - normalized_weigths[d.docid] = d.weight/standard_deviation | |
125 | + if standard_deviation>1: | |
126 | + # values between [0-1] would cause the opposite effect | |
127 | + normalized_weigths[d.docid] = d.weight/standard_deviation | |
128 | + else: | |
129 | + normalized_weigths[d.docid] = d.weight | |
126 | 130 | return tfidf_weighting(index,docs,content_filter,normalized_weigths) |
127 | 131 | |
128 | 132 | class FilteredXapianIndex(xapian.WritableDatabase): |
... | ... | @@ -298,6 +302,7 @@ class PopconSubmission(): |
298 | 302 | for line in submission: |
299 | 303 | if line.startswith("POPULARITY"): |
300 | 304 | self.user_id = line.split()[2].lstrip("ID:") |
305 | + self.arch = line.split()[3].lstrip("ARCH:") | |
301 | 306 | elif not line.startswith("END-POPULARITY"): |
302 | 307 | data = line.rstrip('\n').split() |
303 | 308 | if len(data) > 2: |
... | ... | @@ -371,6 +376,8 @@ class FilteredPopconXapianIndex(xapian.WritableDatabase): |
371 | 376 | (submission.user_id,len(submission_pkgs))) |
372 | 377 | else: |
373 | 378 | doc.set_data(submission.user_id) |
379 | + doc.add_term("ID"+submission.user_id) | |
380 | + doc.add_term("ARCH"+submission.arch) | |
374 | 381 | logging.debug("Parsing popcon submission \'%s\'" % |
375 | 382 | submission.user_id) |
376 | 383 | for pkg,freq in submission_pkgs.items(): | ... | ... |