Commit e2abc449dea04bc05cdbccd8602bb3dbe4a15bf8
1 parent
91384b32
Exists in
master
and in
1 other branch
Added arch info to popcon and fixed standard deviation bug.
Showing
1 changed file
with
8 additions
and
1 deletion
Show diff stats
src/data.py
@@ -122,7 +122,11 @@ def tfidf_plus(index,docs,content_filter): | @@ -122,7 +122,11 @@ def tfidf_plus(index,docs,content_filter): | ||
122 | variance = sum([(p-mean)*(p-mean) for p in population])/len(population) | 122 | variance = sum([(p-mean)*(p-mean) for p in population])/len(population) |
123 | standard_deviation = math.sqrt(variance) | 123 | standard_deviation = math.sqrt(variance) |
124 | for d in docs: | 124 | for d in docs: |
125 | - normalized_weigths[d.docid] = d.weight/standard_deviation | 125 | + if standard_deviation>1: |
126 | + # values between [0-1] would cause the opposite effect | ||
127 | + normalized_weigths[d.docid] = d.weight/standard_deviation | ||
128 | + else: | ||
129 | + normalized_weigths[d.docid] = d.weight | ||
126 | return tfidf_weighting(index,docs,content_filter,normalized_weigths) | 130 | return tfidf_weighting(index,docs,content_filter,normalized_weigths) |
127 | 131 | ||
128 | class FilteredXapianIndex(xapian.WritableDatabase): | 132 | class FilteredXapianIndex(xapian.WritableDatabase): |
@@ -298,6 +302,7 @@ class PopconSubmission(): | @@ -298,6 +302,7 @@ class PopconSubmission(): | ||
298 | for line in submission: | 302 | for line in submission: |
299 | if line.startswith("POPULARITY"): | 303 | if line.startswith("POPULARITY"): |
300 | self.user_id = line.split()[2].lstrip("ID:") | 304 | self.user_id = line.split()[2].lstrip("ID:") |
305 | + self.arch = line.split()[3].lstrip("ARCH:") | ||
301 | elif not line.startswith("END-POPULARITY"): | 306 | elif not line.startswith("END-POPULARITY"): |
302 | data = line.rstrip('\n').split() | 307 | data = line.rstrip('\n').split() |
303 | if len(data) > 2: | 308 | if len(data) > 2: |
@@ -371,6 +376,8 @@ class FilteredPopconXapianIndex(xapian.WritableDatabase): | @@ -371,6 +376,8 @@ class FilteredPopconXapianIndex(xapian.WritableDatabase): | ||
371 | (submission.user_id,len(submission_pkgs))) | 376 | (submission.user_id,len(submission_pkgs))) |
372 | else: | 377 | else: |
373 | doc.set_data(submission.user_id) | 378 | doc.set_data(submission.user_id) |
379 | + doc.add_term("ID"+submission.user_id) | ||
380 | + doc.add_term("ARCH"+submission.arch) | ||
374 | logging.debug("Parsing popcon submission \'%s\'" % | 381 | logging.debug("Parsing popcon submission \'%s\'" % |
375 | submission.user_id) | 382 | submission.user_id) |
376 | for pkg,freq in submission_pkgs.items(): | 383 | for pkg,freq in submission_pkgs.items(): |