Commit 20ed507fdbd5166477c79aec137ae7bbde609ebf
1 parent
1365999b
Exists in
master
and in
22 other branches
[search-improvements] Normalizing occurrence and relevance scores
Showing
5 changed files
with
81 additions
and
20 deletions
Show diff stats
app/models/search_term.rb
@@ -8,26 +8,55 @@ class SearchTerm < ActiveRecord::Base | @@ -8,26 +8,55 @@ class SearchTerm < ActiveRecord::Base | ||
8 | attr_accessible :term, :context, :asset | 8 | attr_accessible :term, :context, :asset |
9 | 9 | ||
10 | def self.calculate_scores | 10 | def self.calculate_scores |
11 | - find_each { |search_term| search_term.calculate_score } | 11 | + os = occurrences_scores |
12 | + find_each { |search_term| search_term.calculate_score(os) } | ||
12 | end | 13 | end |
13 | 14 | ||
14 | def self.find_or_create(term, context, asset='all') | 15 | def self.find_or_create(term, context, asset='all') |
15 | context.search_terms.where(:term => term, :asset => asset).first || context.search_terms.create!(:term => term, :asset=> asset) | 16 | context.search_terms.where(:term => term, :asset => asset).first || context.search_terms.create!(:term => term, :asset=> asset) |
16 | end | 17 | end |
17 | 18 | ||
18 | - def calculate_score | 19 | + # Fast way of getting the occurrences score for each search_term. Ugly but fast! |
20 | + # | ||
21 | + # Each occurrence of a search_term has a score that gets smaller the older the | ||
22 | + # occurrence is. We subtract the amount of time between now and the | ||
23 | + # moment it happened from the total time any occurrence remains valid. E.g.: | ||
24 | + # The expiration time is 100 days and an occurrence happened 3 days ago. | ||
25 | + # Therefore the score is 97. Then we sum every score to get the total score | ||
26 | + # for a search term. | ||
27 | + def self.occurrences_scores | ||
28 | + ActiveSupport::OrderedHash[*ActiveRecord::Base.connection.execute( | ||
29 | + joins(:occurrences). | ||
30 | + select("search_terms.id, sum(#{SearchTermOccurrence::EXPIRATION_TIME.to_i} - extract(epoch from (now() - search_term_occurrences.created_at))) as value"). | ||
31 | + where("search_term_occurrences.created_at > ?", DateTime.now - SearchTermOccurrence::EXPIRATION_TIME). | ||
32 | + group("search_terms.id"). | ||
33 | + order('value DESC'). | ||
34 | + to_sql | ||
35 | + ).map {|result| [result['id'].to_i, result['value'].to_i]}.flatten] | ||
36 | + end | ||
37 | + | ||
38 | + def calculate_occurrence(occurrences_scores) | ||
39 | + max_score = occurrences_scores.first[1] | ||
40 | + (occurrences_scores[id]/max_score.to_f)*100 | ||
41 | + end | ||
42 | + | ||
43 | + def calculate_relevance(valid_occurrences) | ||
44 | + indexed = valid_occurrences.last.indexed.to_f | ||
45 | + total = valid_occurrences.last.total.to_f | ||
46 | + (1 - indexed/total)*100 | ||
47 | + end | ||
48 | + | ||
49 | + def calculate_score(occurrences_scores) | ||
19 | valid_occurrences = occurrences.valid | 50 | valid_occurrences = occurrences.valid |
20 | if valid_occurrences.present? | 51 | if valid_occurrences.present? |
21 | - indexed = valid_occurrences.last.indexed | ||
22 | - total = valid_occurrences.last.total | ||
23 | - # Using the formula described on this paper: http://www.soi.city.ac.uk/~ser/papers/RSJ76.pdf | ||
24 | - current_relevance = indexed > 0 && total >= indexed ? -Math.log(indexed.to_f/total.to_f) : 0 | ||
25 | - # Damp number of occurrences with log function to decrease its effect over relevance. | ||
26 | - damped_occurrences = Math.log(valid_occurrences.count) | ||
27 | - self.score = (damped_occurrences * current_relevance).to_f | 52 | + # These scores range from 0 to 100 |
53 | + self.occurrence_score = calculate_occurrence(occurrences_scores) | ||
54 | + self.relevance_score = calculate_relevance(valid_occurrences) | ||
28 | else | 55 | else |
29 | - self.score = 0 | 56 | + self.occurrence_score = 0 |
57 | + self.relevance_score = 0 | ||
30 | end | 58 | end |
59 | + self.score = (occurrence_score * relevance_score)/100.0 | ||
31 | self.save! | 60 | self.save! |
32 | end | 61 | end |
33 | end | 62 | end |
app/models/search_term_occurrence.rb
@@ -3,8 +3,7 @@ class SearchTermOccurrence < ActiveRecord::Base | @@ -3,8 +3,7 @@ class SearchTermOccurrence < ActiveRecord::Base | ||
3 | validates_presence_of :search_term | 3 | validates_presence_of :search_term |
4 | attr_accessible :search_term, :created_at, :total, :indexed | 4 | attr_accessible :search_term, :created_at, :total, :indexed |
5 | 5 | ||
6 | - #TODO Verify this value | ||
7 | - EXPIRATION_TIME = 1.month | 6 | + EXPIRATION_TIME = 1.year |
8 | 7 | ||
9 | - scope :valid, :conditions => ["search_term_occurrences.created_at >= ?", DateTime.now - EXPIRATION_TIME] | 8 | + scope :valid, :conditions => ["search_term_occurrences.created_at > ?", DateTime.now - EXPIRATION_TIME] |
10 | end | 9 | end |
db/migrate/20140507205338_create_search_terms.rb
@@ -5,13 +5,23 @@ class CreateSearchTerms < ActiveRecord::Migration | @@ -5,13 +5,23 @@ class CreateSearchTerms < ActiveRecord::Migration | ||
5 | t.references :context, :polymorphic => true | 5 | t.references :context, :polymorphic => true |
6 | t.string :asset, :default => 'all' | 6 | t.string :asset, :default => 'all' |
7 | t.float :score, :default => 0 | 7 | t.float :score, :default => 0 |
8 | + t.float :relevance_score, :default => 0 | ||
9 | + t.float :occurrence_score, :default => 0 | ||
8 | end | 10 | end |
9 | 11 | ||
10 | - add_index :search_terms, [:term, :asset, :score] | 12 | + add_index :search_terms, :term |
13 | + add_index :search_terms, :asset | ||
14 | + add_index :search_terms, :score | ||
15 | + add_index :search_terms, :relevance_score | ||
16 | + add_index :search_terms, :occurrence_score | ||
11 | end | 17 | end |
12 | 18 | ||
13 | def down | 19 | def down |
14 | - remove_index :search_terms, [:term, :asset, :score] | 20 | + remove_index :search_terms, :term |
21 | + remove_index :search_terms, :asset | ||
22 | + remove_index :search_terms, :score | ||
23 | + remove_index :search_terms, :relevance_score | ||
24 | + remove_index :search_terms, :occurrence_score | ||
15 | drop_table :search_terms | 25 | drop_table :search_terms |
16 | end | 26 | end |
17 | end | 27 | end |
db/schema.rb
@@ -563,11 +563,17 @@ ActiveRecord::Schema.define(:version => 20140507205338) do | @@ -563,11 +563,17 @@ ActiveRecord::Schema.define(:version => 20140507205338) do | ||
563 | t.string "term" | 563 | t.string "term" |
564 | t.integer "context_id" | 564 | t.integer "context_id" |
565 | t.string "context_type" | 565 | t.string "context_type" |
566 | - t.string "asset", :default => "all" | ||
567 | - t.float "score", :default => 0.0 | 566 | + t.string "asset", :default => "all" |
567 | + t.float "score", :default => 0.0 | ||
568 | + t.float "relevance_score", :default => 0.0 | ||
569 | + t.float "occurrence_score", :default => 0.0 | ||
568 | end | 570 | end |
569 | 571 | ||
570 | - add_index "search_terms", ["term", "asset", "score"], :name => "index_search_terms_on_term_and_asset_and_score" | 572 | + add_index "search_terms", ["asset"], :name => "index_search_terms_on_asset" |
573 | + add_index "search_terms", ["occurrence_score"], :name => "index_search_terms_on_occurrence_score" | ||
574 | + add_index "search_terms", ["relevance_score"], :name => "index_search_terms_on_relevance_score" | ||
575 | + add_index "search_terms", ["score"], :name => "index_search_terms_on_score" | ||
576 | + add_index "search_terms", ["term"], :name => "index_search_terms_on_term" | ||
571 | 577 | ||
572 | create_table "sessions", :force => true do |t| | 578 | create_table "sessions", :force => true do |t| |
573 | t.string "session_id", :null => false | 579 | t.string "session_id", :null => false |
test/unit/search_term_test.rb
@@ -51,14 +51,16 @@ class SearchTermTest < ActiveSupport::TestCase | @@ -51,14 +51,16 @@ class SearchTermTest < ActiveSupport::TestCase | ||
51 | SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3) | 51 | SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3) |
52 | # Search term must happen at least two times to be considered | 52 | # Search term must happen at least two times to be considered |
53 | SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3) | 53 | SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3) |
54 | - search_term.calculate_score | 54 | + SearchTerm.calculate_scores |
55 | + search_term.reload | ||
55 | assert search_term.score > 0, "Score was not calculated." | 56 | assert search_term.score > 0, "Score was not calculated." |
56 | end | 57 | end |
57 | 58 | ||
58 | should 'not consider expired occurrences to calculate the score' do | 59 | should 'not consider expired occurrences to calculate the score' do |
59 | search_term = SearchTerm.find_or_create('universe', Environment.default) | 60 | search_term = SearchTerm.find_or_create('universe', Environment.default) |
60 | occurrence = SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3, :created_at => DateTime.now - (SearchTermOccurrence::EXPIRATION_TIME + 1.day)) | 61 | occurrence = SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3, :created_at => DateTime.now - (SearchTermOccurrence::EXPIRATION_TIME + 1.day)) |
61 | - search_term.calculate_score | 62 | + SearchTerm.calculate_scores |
63 | + search_term.reload | ||
62 | assert search_term.score == 0, "Considered expired occurrence to calculate the score." | 64 | assert search_term.score == 0, "Considered expired occurrence to calculate the score." |
63 | end | 65 | end |
64 | 66 | ||
@@ -80,4 +82,19 @@ class SearchTermTest < ActiveSupport::TestCase | @@ -80,4 +82,19 @@ class SearchTermTest < ActiveSupport::TestCase | ||
80 | assert st2.score > 0, "Did not calculate st2 score." | 82 | assert st2.score > 0, "Did not calculate st2 score." |
81 | end | 83 | end |
82 | 84 | ||
85 | + should 'the older the occurrence the less it should influence the score' do | ||
86 | + st1 = SearchTerm.find_or_create('st1', Environment.default) | ||
87 | + SearchTermOccurrence.create!(:search_term => st1, :total => 10, :indexed => 3, :created_at => 1.month.ago) | ||
88 | + SearchTermOccurrence.create!(:search_term => st1, :total => 20, :indexed => 8, :created_at => 1.month.ago) | ||
89 | + st2 = SearchTerm.find_or_create('st2', Environment.default) | ||
90 | + SearchTermOccurrence.create!(:search_term => st2, :total => 10, :indexed => 3, :created_at => 2.months.ago) | ||
91 | + SearchTermOccurrence.create!(:search_term => st2, :total => 20, :indexed => 8, :created_at => 2.months.ago) | ||
92 | + | ||
93 | + SearchTerm.calculate_scores | ||
94 | + st1.reload | ||
95 | + st2.reload | ||
96 | + | ||
97 | + assert st1.score > st2.score, "Older occurrences are not influencing score less than newer ones." | ||
98 | + end | ||
99 | + | ||
83 | end | 100 | end |