Commit e2abc449dea04bc05cdbccd8602bb3dbe4a15bf8
1 parent
91384b32
Exists in
master
and in
1 other branch
Added arch info to popcon and fixed standard deviation bug.
Showing
1 changed file
with
8 additions
and
1 deletion
Show diff stats
src/data.py
| ... | ... | @@ -122,7 +122,11 @@ def tfidf_plus(index,docs,content_filter): |
| 122 | 122 | variance = sum([(p-mean)*(p-mean) for p in population])/len(population) |
| 123 | 123 | standard_deviation = math.sqrt(variance) |
| 124 | 124 | for d in docs: |
| 125 | - normalized_weigths[d.docid] = d.weight/standard_deviation | |
| 125 | + if standard_deviation>1: | |
| 126 | + # values between [0-1] would cause the opposite effect | |
| 127 | + normalized_weigths[d.docid] = d.weight/standard_deviation | |
| 128 | + else: | |
| 129 | + normalized_weigths[d.docid] = d.weight | |
| 126 | 130 | return tfidf_weighting(index,docs,content_filter,normalized_weigths) |
| 127 | 131 | |
| 128 | 132 | class FilteredXapianIndex(xapian.WritableDatabase): |
| ... | ... | @@ -298,6 +302,7 @@ class PopconSubmission(): |
| 298 | 302 | for line in submission: |
| 299 | 303 | if line.startswith("POPULARITY"): |
| 300 | 304 | self.user_id = line.split()[2].lstrip("ID:") |
| 305 | + self.arch = line.split()[3].lstrip("ARCH:") | |
| 301 | 306 | elif not line.startswith("END-POPULARITY"): |
| 302 | 307 | data = line.rstrip('\n').split() |
| 303 | 308 | if len(data) > 2: |
| ... | ... | @@ -371,6 +376,8 @@ class FilteredPopconXapianIndex(xapian.WritableDatabase): |
| 371 | 376 | (submission.user_id,len(submission_pkgs))) |
| 372 | 377 | else: |
| 373 | 378 | doc.set_data(submission.user_id) |
| 379 | + doc.add_term("ID"+submission.user_id) | |
| 380 | + doc.add_term("ARCH"+submission.arch) | |
| 374 | 381 | logging.debug("Parsing popcon submission \'%s\'" % |
| 375 | 382 | submission.user_id) |
| 376 | 383 | for pkg,freq in submission_pkgs.items(): | ... | ... |