Commit e2abc449dea04bc05cdbccd8602bb3dbe4a15bf8

Authored by Tássia Camões Araújo
1 parent 91384b32
Exists in master and in 1 other branch add_vagrant

Added arch info to popcon and fixed standard deviation bug.

Showing 1 changed file with 8 additions and 1 deletion   Show diff stats
src/data.py
... ... @@ -122,7 +122,11 @@ def tfidf_plus(index,docs,content_filter):
122 122 variance = sum([(p-mean)*(p-mean) for p in population])/len(population)
123 123 standard_deviation = math.sqrt(variance)
124 124 for d in docs:
125   - normalized_weigths[d.docid] = d.weight/standard_deviation
  125 + if standard_deviation>1:
  126 + # dividing by a deviation in [0, 1] would inflate the weights (or divide by zero) instead of scaling them down
  127 + normalized_weigths[d.docid] = d.weight/standard_deviation
  128 + else:
  129 + normalized_weigths[d.docid] = d.weight
126 130 return tfidf_weighting(index,docs,content_filter,normalized_weigths)
127 131  
128 132 class FilteredXapianIndex(xapian.WritableDatabase):
... ... @@ -298,6 +302,7 @@ class PopconSubmission():
298 302 for line in submission:
299 303 if line.startswith("POPULARITY"):
300 304 self.user_id = line.split()[2].lstrip("ID:")
  305 + self.arch = line.split()[3].lstrip("ARCH:")
301 306 elif not line.startswith("END-POPULARITY"):
302 307 data = line.rstrip('\n').split()
303 308 if len(data) > 2:
... ... @@ -371,6 +376,8 @@ class FilteredPopconXapianIndex(xapian.WritableDatabase):
371 376 (submission.user_id,len(submission_pkgs)))
372 377 else:
373 378 doc.set_data(submission.user_id)
  379 + doc.add_term("ID"+submission.user_id)
  380 + doc.add_term("ARCH"+submission.arch)
374 381 logging.debug("Parsing popcon submission \'%s\'" %
375 382 submission.user_id)
376 383 for pkg,freq in submission_pkgs.items():
... ...