Commit 20ed507fdbd5166477c79aec137ae7bbde609ebf

Authored by Rodrigo Souto
1 parent 1365999b

[search-improvements] Normalizing occurrence and relevance scores

app/models/search_term.rb
@@ -8,26 +8,55 @@ class SearchTerm < ActiveRecord::Base @@ -8,26 +8,55 @@ class SearchTerm < ActiveRecord::Base
8 attr_accessible :term, :context, :asset 8 attr_accessible :term, :context, :asset
9 9
10 def self.calculate_scores 10 def self.calculate_scores
11 - find_each { |search_term| search_term.calculate_score } 11 + os = occurrences_scores
  12 + find_each { |search_term| search_term.calculate_score(os) }
12 end 13 end
13 14
14 def self.find_or_create(term, context, asset='all') 15 def self.find_or_create(term, context, asset='all')
15 context.search_terms.where(:term => term, :asset => asset).first || context.search_terms.create!(:term => term, :asset=> asset) 16 context.search_terms.where(:term => term, :asset => asset).first || context.search_terms.create!(:term => term, :asset=> asset)
16 end 17 end
17 18
18 - def calculate_score 19 + # Fast way of getting the occurrences score for each search_term. Ugly but fast!
  20 + #
  21 + # Each occurrence of a search_term has a score that gets smaller the older
  22 + # the occurrence is. We subtract the amount of time between now and the
  23 + # moment it happened from the total time any occurrence stays valid. E.g.:
  24 + # The expiration time is 100 days and an occurrence happened 3 days ago.
  25 + # Therefore the score is 97. Then we sum every score to get the total score
  26 + # for a search term.
  27 + def self.occurrences_scores
  28 + ActiveSupport::OrderedHash[*ActiveRecord::Base.connection.execute(
  29 + joins(:occurrences).
  30 + select("search_terms.id, sum(#{SearchTermOccurrence::EXPIRATION_TIME.to_i} - extract(epoch from (now() - search_term_occurrences.created_at))) as value").
  31 + where("search_term_occurrences.created_at > ?", DateTime.now - SearchTermOccurrence::EXPIRATION_TIME).
  32 + group("search_terms.id").
  33 + order('value DESC').
  34 + to_sql
  35 + ).map {|result| [result['id'].to_i, result['value'].to_i]}.flatten]
  36 + end
  37 +
  38 + def calculate_occurrence(occurrences_scores)
  39 + max_score = occurrences_scores.first[1]
  40 + (occurrences_scores[id]/max_score.to_f)*100
  41 + end
  42 +
  43 + def calculate_relevance(valid_occurrences)
  44 + indexed = valid_occurrences.last.indexed.to_f
  45 + total = valid_occurrences.last.total.to_f
  46 + (1 - indexed/total)*100
  47 + end
  48 +
  49 + def calculate_score(occurrences_scores)
19 valid_occurrences = occurrences.valid 50 valid_occurrences = occurrences.valid
20 if valid_occurrences.present? 51 if valid_occurrences.present?
21 - indexed = valid_occurrences.last.indexed  
22 - total = valid_occurrences.last.total  
23 - # Using the formula described in this paper: http://www.soi.city.ac.uk/~ser/papers/RSJ76.pdf  
24 - current_relevance = indexed > 0 && total >= indexed ? -Math.log(indexed.to_f/total.to_f) : 0  
25 - # Damp number of occurrences with log function to decrease its effect over relevance.  
26 - damped_occurrences = Math.log(valid_occurrences.count)  
27 - self.score = (damped_occurrences * current_relevance).to_f 52 + # These scores vary from 1~100
  53 + self.occurrence_score = calculate_occurrence(occurrences_scores)
  54 + self.relevance_score = calculate_relevance(valid_occurrences)
28 else 55 else
29 - self.score = 0 56 + self.occurrence_score = 0
  57 + self.relevance_score = 0
30 end 58 end
  59 + self.score = (occurrence_score * relevance_score)/100.0
31 self.save! 60 self.save!
32 end 61 end
33 end 62 end
app/models/search_term_occurrence.rb
@@ -3,8 +3,7 @@ class SearchTermOccurrence < ActiveRecord::Base @@ -3,8 +3,7 @@ class SearchTermOccurrence < ActiveRecord::Base
3 validates_presence_of :search_term 3 validates_presence_of :search_term
4 attr_accessible :search_term, :created_at, :total, :indexed 4 attr_accessible :search_term, :created_at, :total, :indexed
5 5
6 - #TODO Verify this value  
7 - EXPIRATION_TIME = 1.month 6 + EXPIRATION_TIME = 1.year
8 7
9 - scope :valid, :conditions => ["search_term_occurrences.created_at >= ?", DateTime.now - EXPIRATION_TIME] 8 + scope :valid, :conditions => ["search_term_occurrences.created_at > ?", DateTime.now - EXPIRATION_TIME]
10 end 9 end
db/migrate/20140507205338_create_search_terms.rb
@@ -5,13 +5,23 @@ class CreateSearchTerms < ActiveRecord::Migration @@ -5,13 +5,23 @@ class CreateSearchTerms < ActiveRecord::Migration
5 t.references :context, :polymorphic => true 5 t.references :context, :polymorphic => true
6 t.string :asset, :default => 'all' 6 t.string :asset, :default => 'all'
7 t.float :score, :default => 0 7 t.float :score, :default => 0
  8 + t.float :relevance_score, :default => 0
  9 + t.float :occurrence_score, :default => 0
8 end 10 end
9 11
10 - add_index :search_terms, [:term, :asset, :score] 12 + add_index :search_terms, :term
  13 + add_index :search_terms, :asset
  14 + add_index :search_terms, :score
  15 + add_index :search_terms, :relevance_score
  16 + add_index :search_terms, :occurrence_score
11 end 17 end
12 18
13 def down 19 def down
14 - remove_index :search_terms, [:term, :asset, :score] 20 + remove_index :search_terms, :term
  21 + remove_index :search_terms, :asset
  22 + remove_index :search_terms, :score
  23 + remove_index :search_terms, :relevance_score
  24 + remove_index :search_terms, :occurrence_score
15 drop_table :search_terms 25 drop_table :search_terms
16 end 26 end
17 end 27 end
@@ -563,11 +563,17 @@ ActiveRecord::Schema.define(:version => 20140507205338) do @@ -563,11 +563,17 @@ ActiveRecord::Schema.define(:version => 20140507205338) do
563 t.string "term" 563 t.string "term"
564 t.integer "context_id" 564 t.integer "context_id"
565 t.string "context_type" 565 t.string "context_type"
566 - t.string "asset", :default => "all"  
567 - t.float "score", :default => 0.0 566 + t.string "asset", :default => "all"
  567 + t.float "score", :default => 0.0
  568 + t.float "relevance_score", :default => 0.0
  569 + t.float "occurrence_score", :default => 0.0
568 end 570 end
569 571
570 - add_index "search_terms", ["term", "asset", "score"], :name => "index_search_terms_on_term_and_asset_and_score" 572 + add_index "search_terms", ["asset"], :name => "index_search_terms_on_asset"
  573 + add_index "search_terms", ["occurrence_score"], :name => "index_search_terms_on_occurrence_score"
  574 + add_index "search_terms", ["relevance_score"], :name => "index_search_terms_on_relevance_score"
  575 + add_index "search_terms", ["score"], :name => "index_search_terms_on_score"
  576 + add_index "search_terms", ["term"], :name => "index_search_terms_on_term"
571 577
572 create_table "sessions", :force => true do |t| 578 create_table "sessions", :force => true do |t|
573 t.string "session_id", :null => false 579 t.string "session_id", :null => false
test/unit/search_term_test.rb
@@ -51,14 +51,16 @@ class SearchTermTest < ActiveSupport::TestCase @@ -51,14 +51,16 @@ class SearchTermTest < ActiveSupport::TestCase
51 SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3) 51 SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3)
52 # Search term must happen at least two times to be considered 52 # Search term must happen at least two times to be considered
53 SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3) 53 SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3)
54 - search_term.calculate_score 54 + SearchTerm.calculate_scores
  55 + search_term.reload
55 assert search_term.score > 0, "Score was not calculated." 56 assert search_term.score > 0, "Score was not calculated."
56 end 57 end
57 58
58 should 'not consider expired occurrences to calculate the score' do 59 should 'not consider expired occurrences to calculate the score' do
59 search_term = SearchTerm.find_or_create('universe', Environment.default) 60 search_term = SearchTerm.find_or_create('universe', Environment.default)
60 occurrence = SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3, :created_at => DateTime.now - (SearchTermOccurrence::EXPIRATION_TIME + 1.day)) 61 occurrence = SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3, :created_at => DateTime.now - (SearchTermOccurrence::EXPIRATION_TIME + 1.day))
61 - search_term.calculate_score 62 + SearchTerm.calculate_scores
  63 + search_term.reload
62 assert search_term.score == 0, "Considered expired occurrence to calculate the score." 64 assert search_term.score == 0, "Considered expired occurrence to calculate the score."
63 end 65 end
64 66
@@ -80,4 +82,19 @@ class SearchTermTest < ActiveSupport::TestCase @@ -80,4 +82,19 @@ class SearchTermTest < ActiveSupport::TestCase
80 assert st2.score > 0, "Did not calculate st2 score." 82 assert st2.score > 0, "Did not calculate st2 score."
81 end 83 end
82 84
  85 + should 'the older the occurrence the less it should influence the score' do
  86 + st1 = SearchTerm.find_or_create('st1', Environment.default)
  87 + SearchTermOccurrence.create!(:search_term => st1, :total => 10, :indexed => 3, :created_at => 1.month.ago)
  88 + SearchTermOccurrence.create!(:search_term => st1, :total => 20, :indexed => 8, :created_at => 1.month.ago)
  89 + st2 = SearchTerm.find_or_create('st2', Environment.default)
  90 + SearchTermOccurrence.create!(:search_term => st2, :total => 10, :indexed => 3, :created_at => 2.months.ago)
  91 + SearchTermOccurrence.create!(:search_term => st2, :total => 20, :indexed => 8, :created_at => 2.months.ago)
  92 +
  93 + SearchTerm.calculate_scores
  94 + st1.reload
  95 + st2.reload
  96 +
  97 + assert st1.score > st2.score, "Older occurrences are not influencing score less than newer ones."
  98 + end
  99 +
83 end 100 end