Commit e2abc449dea04bc05cdbccd8602bb3dbe4a15bf8

Authored by Tássia Camões Araújo
1 parent 91384b32
Exists in master and in 1 other branch add_vagrant

Added arch info to popcon and fixed standard deviation bug.

Showing 1 changed file with 8 additions and 1 deletion   Show diff stats
@@ -122,7 +122,11 @@ def tfidf_plus(index,docs,content_filter): @@ -122,7 +122,11 @@ def tfidf_plus(index,docs,content_filter):
122 variance = sum([(p-mean)*(p-mean) for p in population])/len(population) 122 variance = sum([(p-mean)*(p-mean) for p in population])/len(population)
123 standard_deviation = math.sqrt(variance) 123 standard_deviation = math.sqrt(variance)
124 for d in docs: 124 for d in docs:
125 - normalized_weigths[d.docid] = d.weight/standard_deviation 125 + if standard_deviation>1:
  126 + # dividing by a standard deviation in [0, 1] would enlarge the weights instead of scaling them down
  127 + normalized_weigths[d.docid] = d.weight/standard_deviation
  128 + else:
  129 + normalized_weigths[d.docid] = d.weight
126 return tfidf_weighting(index,docs,content_filter,normalized_weigths) 130 return tfidf_weighting(index,docs,content_filter,normalized_weigths)
127 131
128 class FilteredXapianIndex(xapian.WritableDatabase): 132 class FilteredXapianIndex(xapian.WritableDatabase):
@@ -298,6 +302,7 @@ class PopconSubmission(): @@ -298,6 +302,7 @@ class PopconSubmission():
298 for line in submission: 302 for line in submission:
299 if line.startswith("POPULARITY"): 303 if line.startswith("POPULARITY"):
300 self.user_id = line.split()[2].lstrip("ID:") 304 self.user_id = line.split()[2].lstrip("ID:")
  305 + self.arch = line.split()[3].lstrip("ARCH:")
301 elif not line.startswith("END-POPULARITY"): 306 elif not line.startswith("END-POPULARITY"):
302 data = line.rstrip('\n').split() 307 data = line.rstrip('\n').split()
303 if len(data) > 2: 308 if len(data) > 2:
@@ -371,6 +376,8 @@ class FilteredPopconXapianIndex(xapian.WritableDatabase): @@ -371,6 +376,8 @@ class FilteredPopconXapianIndex(xapian.WritableDatabase):
371 (submission.user_id,len(submission_pkgs))) 376 (submission.user_id,len(submission_pkgs)))
372 else: 377 else:
373 doc.set_data(submission.user_id) 378 doc.set_data(submission.user_id)
  379 + doc.add_term("ID"+submission.user_id)
  380 + doc.add_term("ARCH"+submission.arch)
374 logging.debug("Parsing popcon submission \'%s\'" % 381 logging.debug("Parsing popcon submission \'%s\'" %
375 submission.user_id) 382 submission.user_id)
376 for pkg,freq in submission_pkgs.items(): 383 for pkg,freq in submission_pkgs.items():