Commit 20ed507fdbd5166477c79aec137ae7bbde609ebf
1 parent
1365999b
Exists in
master
and in
29 other branches
[search-improvements] Normalizing occurrence and relevance scores
Showing
5 changed files
with
81 additions
and
20 deletions
Show diff stats
app/models/search_term.rb
... | ... | @@ -8,26 +8,55 @@ class SearchTerm < ActiveRecord::Base |
8 | 8 | attr_accessible :term, :context, :asset |
9 | 9 | |
10 | 10 | def self.calculate_scores |
11 | - find_each { |search_term| search_term.calculate_score } | |
11 | + os = occurrences_scores | |
12 | + find_each { |search_term| search_term.calculate_score(os) } | |
12 | 13 | end |
13 | 14 | |
14 | 15 | def self.find_or_create(term, context, asset='all') |
15 | 16 | context.search_terms.where(:term => term, :asset => asset).first || context.search_terms.create!(:term => term, :asset=> asset) |
16 | 17 | end |
17 | 18 | |
18 | - def calculate_score | |
19 | + # Fast way of getting the occurrences score for each search_term. Ugly but fast! | |
20 | + # | |
21 | + # Each occurrence of a search_term has a score that gets smaller the older the | |
22 | + # occurrence is. We subtract the time elapsed since the occurrence happened | |
23 | + # from the total time during which any occurrence remains valid. E.g.: | |
24 | + # The expiration time is 100 days and an occurrence happened 3 days ago. | |
25 | + # Therefore the score is 97. Then we sum every score to get the total score | |
26 | + # for a search term. | |
27 | + def self.occurrences_scores | |
28 | + ActiveSupport::OrderedHash[*ActiveRecord::Base.connection.execute( | |
29 | + joins(:occurrences). | |
30 | + select("search_terms.id, sum(#{SearchTermOccurrence::EXPIRATION_TIME.to_i} - extract(epoch from (now() - search_term_occurrences.created_at))) as value"). | |
31 | + where("search_term_occurrences.created_at > ?", DateTime.now - SearchTermOccurrence::EXPIRATION_TIME). | |
32 | + group("search_terms.id"). | |
33 | + order('value DESC'). | |
34 | + to_sql | |
35 | + ).map {|result| [result['id'].to_i, result['value'].to_i]}.flatten] | |
36 | + end | |
37 | + | |
38 | + def calculate_occurrence(occurrences_scores) | |
39 | + max_score = occurrences_scores.first[1] | |
40 | + (occurrences_scores[id]/max_score.to_f)*100 | |
41 | + end | |
42 | + | |
43 | + def calculate_relevance(valid_occurrences) | |
44 | + indexed = valid_occurrences.last.indexed.to_f | |
45 | + total = valid_occurrences.last.total.to_f | |
46 | + (1 - indexed/total)*100 | |
47 | + end | |
48 | + | |
49 | + def calculate_score(occurrences_scores) | |
19 | 50 | valid_occurrences = occurrences.valid |
20 | 51 | if valid_occurrences.present? |
21 | - indexed = valid_occurrences.last.indexed | |
22 | - total = valid_occurrences.last.total | |
23 | - # Using the formula described on this paper: http://www.soi.city.ac.uk/~ser/papers/RSJ76.pdf | |
24 | - current_relevance = indexed > 0 && total >= indexed ? -Math.log(indexed.to_f/total.to_f) : 0 | |
25 | - # Damp number of occurrences with log function to decrease it's effect over relevance. | |
26 | - damped_occurrences = Math.log(valid_occurrences.count) | |
27 | - self.score = (damped_occurrences * current_relevance).to_f | |
52 | + # These scores are normalized to a 0~100 scale | |
53 | + self.occurrence_score = calculate_occurrence(occurrences_scores) | |
54 | + self.relevance_score = calculate_relevance(valid_occurrences) | |
28 | 55 | else |
29 | - self.score = 0 | |
56 | + self.occurrence_score = 0 | |
57 | + self.relevance_score = 0 | |
30 | 58 | end |
59 | + self.score = (occurrence_score * relevance_score)/100.0 | |
31 | 60 | self.save! |
32 | 61 | end |
33 | 62 | end | ... | ... |
app/models/search_term_occurrence.rb
... | ... | @@ -3,8 +3,7 @@ class SearchTermOccurrence < ActiveRecord::Base |
3 | 3 | validates_presence_of :search_term |
4 | 4 | attr_accessible :search_term, :created_at, :total, :indexed |
5 | 5 | |
6 | - #TODO Verify this value | |
7 | - EXPIRATION_TIME = 1.month | |
6 | + EXPIRATION_TIME = 1.year | |
8 | 7 | |
9 | - scope :valid, :conditions => ["search_term_occurrences.created_at >= ?", DateTime.now - EXPIRATION_TIME] | |
8 | + scope :valid, :conditions => ["search_term_occurrences.created_at > ?", DateTime.now - EXPIRATION_TIME] | |
10 | 9 | end | ... | ... |
db/migrate/20140507205338_create_search_terms.rb
... | ... | @@ -5,13 +5,23 @@ class CreateSearchTerms < ActiveRecord::Migration |
5 | 5 | t.references :context, :polymorphic => true |
6 | 6 | t.string :asset, :default => 'all' |
7 | 7 | t.float :score, :default => 0 |
8 | + t.float :relevance_score, :default => 0 | |
9 | + t.float :occurrence_score, :default => 0 | |
8 | 10 | end |
9 | 11 | |
10 | - add_index :search_terms, [:term, :asset, :score] | |
12 | + add_index :search_terms, :term | |
13 | + add_index :search_terms, :asset | |
14 | + add_index :search_terms, :score | |
15 | + add_index :search_terms, :relevance_score | |
16 | + add_index :search_terms, :occurrence_score | |
11 | 17 | end |
12 | 18 | |
13 | 19 | def down |
14 | - remove_index :search_terms, [:term, :asset, :score] | |
20 | + remove_index :search_terms, :term | |
21 | + remove_index :search_terms, :asset | |
22 | + remove_index :search_terms, :score | |
23 | + remove_index :search_terms, :relevance_score | |
24 | + remove_index :search_terms, :occurrence_score | |
15 | 25 | drop_table :search_terms |
16 | 26 | end |
17 | 27 | end | ... | ... |
db/schema.rb
... | ... | @@ -563,11 +563,17 @@ ActiveRecord::Schema.define(:version => 20140507205338) do |
563 | 563 | t.string "term" |
564 | 564 | t.integer "context_id" |
565 | 565 | t.string "context_type" |
566 | - t.string "asset", :default => "all" | |
567 | - t.float "score", :default => 0.0 | |
566 | + t.string "asset", :default => "all" | |
567 | + t.float "score", :default => 0.0 | |
568 | + t.float "relevance_score", :default => 0.0 | |
569 | + t.float "occurrence_score", :default => 0.0 | |
568 | 570 | end |
569 | 571 | |
570 | - add_index "search_terms", ["term", "asset", "score"], :name => "index_search_terms_on_term_and_asset_and_score" | |
572 | + add_index "search_terms", ["asset"], :name => "index_search_terms_on_asset" | |
573 | + add_index "search_terms", ["occurrence_score"], :name => "index_search_terms_on_occurrence_score" | |
574 | + add_index "search_terms", ["relevance_score"], :name => "index_search_terms_on_relevance_score" | |
575 | + add_index "search_terms", ["score"], :name => "index_search_terms_on_score" | |
576 | + add_index "search_terms", ["term"], :name => "index_search_terms_on_term" | |
571 | 577 | |
572 | 578 | create_table "sessions", :force => true do |t| |
573 | 579 | t.string "session_id", :null => false | ... | ... |
test/unit/search_term_test.rb
... | ... | @@ -51,14 +51,16 @@ class SearchTermTest < ActiveSupport::TestCase |
51 | 51 | SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3) |
52 | 52 | # Search term must happen at least two times to be considered
53 | 53 | SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3) |
54 | - search_term.calculate_score | |
54 | + SearchTerm.calculate_scores | |
55 | + search_term.reload | |
55 | 56 | assert search_term.score > 0, "Score was not calculated." |
56 | 57 | end |
57 | 58 | |
58 | 59 | should 'not consider expired occurrences to calculate the score' do |
59 | 60 | search_term = SearchTerm.find_or_create('universe', Environment.default) |
60 | 61 | occurrence = SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3, :created_at => DateTime.now - (SearchTermOccurrence::EXPIRATION_TIME + 1.day)) |
61 | - search_term.calculate_score | |
62 | + SearchTerm.calculate_scores | |
63 | + search_term.reload | |
62 | 64 | assert search_term.score == 0, "Considered expired occurrence to calculate the score." |
63 | 65 | end |
64 | 66 | |
... | ... | @@ -80,4 +82,19 @@ class SearchTermTest < ActiveSupport::TestCase |
80 | 82 | assert st2.score > 0, "Did not calculate st2 score." |
81 | 83 | end |
82 | 84 | |
85 | + should 'the older the occurrence the less it should influence the score' do | |
86 | + st1 = SearchTerm.find_or_create('st1', Environment.default) | |
87 | + SearchTermOccurrence.create!(:search_term => st1, :total => 10, :indexed => 3, :created_at => 1.month.ago) | |
88 | + SearchTermOccurrence.create!(:search_term => st1, :total => 20, :indexed => 8, :created_at => 1.month.ago) | |
89 | + st2 = SearchTerm.find_or_create('st2', Environment.default) | |
90 | + SearchTermOccurrence.create!(:search_term => st2, :total => 10, :indexed => 3, :created_at => 2.months.ago) | |
91 | + SearchTermOccurrence.create!(:search_term => st2, :total => 20, :indexed => 8, :created_at => 2.months.ago) | |
92 | + | |
93 | + SearchTerm.calculate_scores | |
94 | + st1.reload | |
95 | + st2.reload | |
96 | + | |
97 | + assert st1.score > st2.score, "Older occurrences are not influencing score less than newer ones." | |
98 | + end | |
99 | + | |
83 | 100 | end | ... | ... |