Commit 20ed507fdbd5166477c79aec137ae7bbde609ebf

Authored by Rodrigo Souto
1 parent 1365999b

[search-improvements] Normalizing occurrence and relevance scores

app/models/search_term.rb
... ... @@ -8,26 +8,55 @@ class SearchTerm < ActiveRecord::Base
8 8 attr_accessible :term, :context, :asset
9 9  
10 10 def self.calculate_scores
11   - find_each { |search_term| search_term.calculate_score }
  11 + os = occurrences_scores
  12 + find_each { |search_term| search_term.calculate_score(os) }
12 13 end
13 14  
14 15 def self.find_or_create(term, context, asset='all')
15 16 context.search_terms.where(:term => term, :asset => asset).first || context.search_terms.create!(:term => term, :asset=> asset)
16 17 end
17 18  
18   - def calculate_score
  19 + # Fast way of getting the occurrences score for each search_term. Ugly but fast!
  20 + #
  21 + # Each occurrence of a search_term has a score that is smaller the older the
  22 + # occurrence happened. We subtract the amount of time between now and the
  23 + # moment it happened from the total time any occurrence is valid to happen. E.g.:
  24 + # The expiration time is 100 days and an occurrence happened 3 days ago.
  25 + # Therefore the score is 97. Then we sum every score to get the total score
  26 + # for a search term.
  27 + def self.occurrences_scores
  28 + ActiveSupport::OrderedHash[*ActiveRecord::Base.connection.execute(
  29 + joins(:occurrences).
  30 + select("search_terms.id, sum(#{SearchTermOccurrence::EXPIRATION_TIME.to_i} - extract(epoch from (now() - search_term_occurrences.created_at))) as value").
  31 + where("search_term_occurrences.created_at > ?", DateTime.now - SearchTermOccurrence::EXPIRATION_TIME).
  32 + group("search_terms.id").
  33 + order('value DESC').
  34 + to_sql
  35 + ).map {|result| [result['id'].to_i, result['value'].to_i]}.flatten]
  36 + end
  37 +
  38 + def calculate_occurrence(occurrences_scores)
  39 + max_score = occurrences_scores.first[1]
  40 + (occurrences_scores[id]/max_score.to_f)*100
  41 + end
  42 +
  43 + def calculate_relevance(valid_occurrences)
  44 + indexed = valid_occurrences.last.indexed.to_f
  45 + total = valid_occurrences.last.total.to_f
  46 + (1 - indexed/total)*100
  47 + end
  48 +
  49 + def calculate_score(occurrences_scores)
19 50 valid_occurrences = occurrences.valid
20 51 if valid_occurrences.present?
21   - indexed = valid_occurrences.last.indexed
22   - total = valid_occurrences.last.total
23   - # Using the formula described on this paper: http://www.soi.city.ac.uk/~ser/papers/RSJ76.pdf
24   - current_relevance = indexed > 0 && total >= indexed ? -Math.log(indexed.to_f/total.to_f) : 0
25   - # Damp number of occurrences with log function to decrease it's effect over relevance.
26   - damped_occurrences = Math.log(valid_occurrences.count)
27   - self.score = (damped_occurrences * current_relevance).to_f
  52 + # These scores vary from 0~100
  53 + self.occurrence_score = calculate_occurrence(occurrences_scores)
  54 + self.relevance_score = calculate_relevance(valid_occurrences)
28 55 else
29   - self.score = 0
  56 + self.occurrence_score = 0
  57 + self.relevance_score = 0
30 58 end
  59 + self.score = (occurrence_score * relevance_score)/100.0
31 60 self.save!
32 61 end
33 62 end
... ...
app/models/search_term_occurrence.rb
... ... @@ -3,8 +3,7 @@ class SearchTermOccurrence < ActiveRecord::Base
3 3 validates_presence_of :search_term
4 4 attr_accessible :search_term, :created_at, :total, :indexed
5 5  
6   - #TODO Verify this value
7   - EXPIRATION_TIME = 1.month
  6 + EXPIRATION_TIME = 1.year
8 7  
9   - scope :valid, :conditions => ["search_term_occurrences.created_at >= ?", DateTime.now - EXPIRATION_TIME]
  8 + scope :valid, :conditions => ["search_term_occurrences.created_at > ?", DateTime.now - EXPIRATION_TIME]
10 9 end
... ...
db/migrate/20140507205338_create_search_terms.rb
... ... @@ -5,13 +5,23 @@ class CreateSearchTerms < ActiveRecord::Migration
5 5 t.references :context, :polymorphic => true
6 6 t.string :asset, :default => 'all'
7 7 t.float :score, :default => 0
  8 + t.float :relevance_score, :default => 0
  9 + t.float :occurrence_score, :default => 0
8 10 end
9 11  
10   - add_index :search_terms, [:term, :asset, :score]
  12 + add_index :search_terms, :term
  13 + add_index :search_terms, :asset
  14 + add_index :search_terms, :score
  15 + add_index :search_terms, :relevance_score
  16 + add_index :search_terms, :occurrence_score
11 17 end
12 18  
13 19 def down
14   - remove_index :search_terms, [:term, :asset, :score]
  20 + remove_index :search_terms, :term
  21 + remove_index :search_terms, :asset
  22 + remove_index :search_terms, :score
  23 + remove_index :search_terms, :relevance_score
  24 + remove_index :search_terms, :occurrence_score
15 25 drop_table :search_terms
16 26 end
17 27 end
... ...
db/schema.rb
... ... @@ -563,11 +563,17 @@ ActiveRecord::Schema.define(:version => 20140507205338) do
563 563 t.string "term"
564 564 t.integer "context_id"
565 565 t.string "context_type"
566   - t.string "asset", :default => "all"
567   - t.float "score", :default => 0.0
  566 + t.string "asset", :default => "all"
  567 + t.float "score", :default => 0.0
  568 + t.float "relevance_score", :default => 0.0
  569 + t.float "occurrence_score", :default => 0.0
568 570 end
569 571  
570   - add_index "search_terms", ["term", "asset", "score"], :name => "index_search_terms_on_term_and_asset_and_score"
  572 + add_index "search_terms", ["asset"], :name => "index_search_terms_on_asset"
  573 + add_index "search_terms", ["occurrence_score"], :name => "index_search_terms_on_occurrence_score"
  574 + add_index "search_terms", ["relevance_score"], :name => "index_search_terms_on_relevance_score"
  575 + add_index "search_terms", ["score"], :name => "index_search_terms_on_score"
  576 + add_index "search_terms", ["term"], :name => "index_search_terms_on_term"
571 577  
572 578 create_table "sessions", :force => true do |t|
573 579 t.string "session_id", :null => false
... ...
test/unit/search_term_test.rb
... ... @@ -51,14 +51,16 @@ class SearchTermTest < ActiveSupport::TestCase
51 51 SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3)
52 52 # Search term must happen at least two times to be considered
53 53 SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3)
54   - search_term.calculate_score
  54 + SearchTerm.calculate_scores
  55 + search_term.reload
55 56 assert search_term.score > 0, "Score was not calculated."
56 57 end
57 58  
58 59 should 'not consider expired occurrences to calculate the score' do
59 60 search_term = SearchTerm.find_or_create('universe', Environment.default)
60 61 occurrence = SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3, :created_at => DateTime.now - (SearchTermOccurrence::EXPIRATION_TIME + 1.day))
61   - search_term.calculate_score
  62 + SearchTerm.calculate_scores
  63 + search_term.reload
62 64 assert search_term.score == 0, "Considered expired occurrence to calculate the score."
63 65 end
64 66  
... ... @@ -80,4 +82,19 @@ class SearchTermTest < ActiveSupport::TestCase
80 82 assert st2.score > 0, "Did not calculate st2 score."
81 83 end
82 84  
  85 + should 'the older the occurrence the less it should influence the score' do
  86 + st1 = SearchTerm.find_or_create('st1', Environment.default)
  87 + SearchTermOccurrence.create!(:search_term => st1, :total => 10, :indexed => 3, :created_at => 1.month.ago)
  88 + SearchTermOccurrence.create!(:search_term => st1, :total => 20, :indexed => 8, :created_at => 1.month.ago)
  89 + st2 = SearchTerm.find_or_create('st2', Environment.default)
  90 + SearchTermOccurrence.create!(:search_term => st2, :total => 10, :indexed => 3, :created_at => 2.months.ago)
  91 + SearchTermOccurrence.create!(:search_term => st2, :total => 20, :indexed => 8, :created_at => 2.months.ago)
  92 +
  93 + SearchTerm.calculate_scores
  94 + st1.reload
  95 + st2.reload
  96 +
  97 + assert st1.score > st2.score, "Older occurrences are not influencing score less than newer ones."
  98 + end
  99 +
83 100 end
... ...