From e2abc449dea04bc05cdbccd8602bb3dbe4a15bf8 Mon Sep 17 00:00:00 2001 From: Tássia Camões Araújo Date: Mon, 12 Sep 2011 04:58:15 +0000 Subject: [PATCH] Added arch info to popcon and fixed standard deviation bug. --- src/data.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/data.py b/src/data.py index e435897..edee10f 100644 --- a/src/data.py +++ b/src/data.py @@ -122,7 +122,11 @@ def tfidf_plus(index,docs,content_filter): variance = sum([(p-mean)*(p-mean) for p in population])/len(population) standard_deviation = math.sqrt(variance) for d in docs: - normalized_weigths[d.docid] = d.weight/standard_deviation + if standard_deviation>1: + # values between [0-1] would cause the opposite effect + normalized_weigths[d.docid] = d.weight/standard_deviation + else: + normalized_weigths[d.docid] = d.weight return tfidf_weighting(index,docs,content_filter,normalized_weigths) class FilteredXapianIndex(xapian.WritableDatabase): @@ -298,6 +302,7 @@ class PopconSubmission(): for line in submission: if line.startswith("POPULARITY"): self.user_id = line.split()[2].lstrip("ID:") + self.arch = line.split()[3].lstrip("ARCH:") elif not line.startswith("END-POPULARITY"): data = line.rstrip('\n').split() if len(data) > 2: @@ -371,6 +376,8 @@ class FilteredPopconXapianIndex(xapian.WritableDatabase): (submission.user_id,len(submission_pkgs))) else: doc.set_data(submission.user_id) + doc.add_term("ID"+submission.user_id) + doc.add_term("ARCH"+submission.arch) logging.debug("Parsing popcon submission \'%s\'" % submission.user_id) for pkg,freq in submission_pkgs.items(): -- libgit2 0.21.2