Commit b374f2c57ee9a01617191cb02753613d6e47d674
1 parent
ab66aa9b
Exists in
master
and in
22 other branches
[search-improvements] Add infrastructure for search_terms score calculation
Showing
9 changed files
with
211 additions
and
1 deletions
Show diff stats
app/models/environment.rb
| ... | ... | @@ -10,6 +10,7 @@ class Environment < ActiveRecord::Base |
| 10 | 10 | self.partial_updates = false |
| 11 | 11 | |
| 12 | 12 | has_many :tasks, :dependent => :destroy, :as => 'target' |
| 13 | + has_many :search_terms, :as => :context | |
| 13 | 14 | |
| 14 | 15 | IDENTIFY_SCRIPTS = /(php[0-9s]?|[sp]htm[l]?|pl|py|cgi|rb)/ |
| 15 | 16 | ... | ... |
app/models/profile.rb
| ... | ... | @@ -138,6 +138,8 @@ class Profile < ActiveRecord::Base |
| 138 | 138 | |
| 139 | 139 | has_many :comments_received, :class_name => 'Comment', :through => :articles, :source => :comments |
| 140 | 140 | |
| 141 | + has_many :search_terms, :as => :context | |
| 142 | + | |
| 141 | 143 | def scraps(scrap=nil) |
| 142 | 144 | scrap = scrap.is_a?(Scrap) ? scrap.id : scrap |
| 143 | 145 | scrap.nil? ? Scrap.all_scraps(self) : Scrap.all_scraps(self).find(scrap) | ... | ... |
| ... | ... | @@ -0,0 +1,34 @@ |
# A term searched within a polymorphic context, restricted to an asset, with a
# relevance score derived from the term's non-expired occurrences.
class SearchTerm < ActiveRecord::Base
  validates_presence_of :term, :context
  validates_uniqueness_of :term, :scope => [:context_id, :context_type, :asset]

  belongs_to :context, :polymorphic => true
  has_many :occurrences, :class_name => 'SearchTermOccurrence'

  attr_accessible :term, :context, :asset

  # The score is refreshed every time the record is saved.
  before_save :calculate_score

  # Recalculates and persists the score of every search term.
  #
  # Saving (instead of only calling +calculate_score+) is what writes the new
  # value to the database; the +before_save+ callback performs the calculation.
  def self.calculate_scores
    find_each { |search_term| search_term.save }
  end

  # Returns the search term for +term+ and +asset+ inside +context+, creating
  # it when it does not exist yet.
  #
  # Raises ActiveRecord::RecordInvalid when the new record is not valid.
  def self.find_or_create(term, context, asset='all')
    context.search_terms.where(:term => term, :asset => asset).first ||
      context.search_terms.create!(:term => term, :asset => asset)
  end

  # Assigns +score+ from the valid (non-expired) occurrences of this term.
  # The score is 0 when there is no valid occurrence. The record is not saved
  # here.
  def calculate_score
    valid_occurrences = occurrences.valid
    occurrence_count = valid_occurrences.count
    # Fetch the most recent occurrence only once; it provides both figures.
    latest = occurrence_count > 0 ? valid_occurrences.last : nil

    if latest
      indexed = latest.indexed.to_i
      total = latest.total.to_i
      # Using the formula described on this paper: http://www.soi.city.ac.uk/~ser/papers/RSJ76.pdf
      # log(total / indexed) is the same value as -log(indexed / total).
      current_relevance = indexed > 0 && total >= indexed ? Math.log(total.to_f / indexed) : 0
      # Damp number of occurrences with log function to decrease its effect over relevance.
      # One is added so that a single occurrence does not zero the score (log(1) == 0).
      damped_occurrences = Math.log(occurrence_count + 1)
      self.score = (damped_occurrences * current_relevance).to_f
    else
      self.score = 0
    end
  end
end
| ... | ... | @@ -0,0 +1,10 @@ |
# A single recorded occurrence of a search term, holding the +total+ and
# +indexed+ figures used by SearchTerm#calculate_score.
class SearchTermOccurrence < ActiveRecord::Base
  belongs_to :search_term
  validates_presence_of :search_term
  attr_accessible :search_term, :created_at, :total, :indexed

  # TODO: Verify this value
  EXPIRATION_TIME = 1.month

  # Occurrences created within the last EXPIRATION_TIME.
  #
  # The condition is wrapped in a lambda so the cut-off date is computed on
  # every call; a plain :conditions hash would be evaluated only once, when the
  # class is loaded, freezing the cut-off at boot time.
  scope :valid, lambda {
    where("search_term_occurrences.created_at >= ?", DateTime.now - EXPIRATION_TIME)
  }
end
db/migrate/20140507195924_create_search_term_occurrences.rb
0 → 100644
| ... | ... | @@ -0,0 +1,16 @@ |
# Creates the search_term_occurrences table and an index on its creation date.
class CreateSearchTermOccurrences < ActiveRecord::Migration
  def up
    create_table :search_term_occurrences do |table|
      table.references :search_term
      table.datetime :created_at
      # Both counters start at zero.
      table.integer :total, :indexed, :default => 0
    end

    add_index :search_term_occurrences, :created_at
  end

  def down
    remove_index :search_term_occurrences, :column => :created_at
    drop_table :search_term_occurrences
  end
end
| ... | ... | @@ -0,0 +1,17 @@ |
# Creates the search_terms table and a composite index over term, asset and score.
class CreateSearchTerms < ActiveRecord::Migration
  def up
    create_table :search_terms do |table|
      table.string :term
      # Polymorphic owner of the term (context_id / context_type columns).
      table.references :context, :polymorphic => true
      table.string :asset, :default => 'all'
      table.float :score, :default => 0
    end

    add_index :search_terms, [:term, :asset, :score]
  end

  def down
    remove_index :search_terms, :column => [:term, :asset, :score]
    drop_table :search_terms
  end
end
db/schema.rb
| ... | ... | @@ -11,7 +11,7 @@ |
| 11 | 11 | # |
| 12 | 12 | # It's strongly recommended to check this file into your version control system. |
| 13 | 13 | |
| 14 | -ActiveRecord::Schema.define(:version => 20140408172149) do | |
| 14 | +ActiveRecord::Schema.define(:version => 20140507205338) do | |
| 15 | 15 | |
| 16 | 16 | create_table "abuse_reports", :force => true do |t| |
| 17 | 17 | t.integer "reporter_id" |
| ... | ... | @@ -550,6 +550,25 @@ ActiveRecord::Schema.define(:version => 20140408172149) do |
| 550 | 550 | t.integer "context_id" |
| 551 | 551 | end |
| 552 | 552 | |
| 553 | + create_table "search_term_occurrences", :force => true do |t| | |
| 554 | + t.integer "search_term_id" | |
| 555 | + t.datetime "created_at" | |
| 556 | + t.integer "total", :default => 0 | |
| 557 | + t.integer "indexed", :default => 0 | |
| 558 | + end | |
| 559 | + | |
| 560 | + add_index "search_term_occurrences", ["created_at"], :name => "index_search_term_occurrences_on_created_at" | |
| 561 | + | |
| 562 | + create_table "search_terms", :force => true do |t| | |
| 563 | + t.string "term" | |
| 564 | + t.integer "context_id" | |
| 565 | + t.string "context_type" | |
| 566 | + t.string "asset", :default => "all" | |
| 567 | + t.float "score", :default => 0.0 | |
| 568 | + end | |
| 569 | + | |
| 570 | + add_index "search_terms", ["term", "asset", "score"], :name => "index_search_terms_on_term_and_asset_and_score" | |
| 571 | + | |
| 553 | 572 | create_table "sessions", :force => true do |t| |
| 554 | 573 | t.string "session_id", :null => false |
| 555 | 574 | t.text "data" | ... | ... |
| ... | ... | @@ -0,0 +1,30 @@ |
| 1 | +require 'test_helper' | |
| 2 | + | |
# Unit tests for SearchTermOccurrence: validation, creation and the +valid+ scope.
class SearchTermOccurrenceTest < ActiveSupport::TestCase

  attr_reader :search_term

  # Every test works with the same search term in the default environment.
  def setup
    @search_term = SearchTerm.find_or_create('universe', Environment.default)
  end

  should 'have term' do
    occurrence = SearchTermOccurrence.new
    assert !occurrence.valid?
    assert occurrence.errors.has_key?(:search_term)
  end

  should 'create a search term occurence' do
    assert_nothing_raised { SearchTermOccurrence.create!(:search_term => search_term) }
  end

  should 'fetch only valid occurrences' do
    recent = [
      SearchTermOccurrence.create!(:search_term => search_term),
      SearchTermOccurrence.create!(:search_term => search_term)
    ]
    # One day older than the expiration window, so it must be left out.
    expired_date = DateTime.now - (SearchTermOccurrence::EXPIRATION_TIME + 1.day)
    SearchTermOccurrence.create!(:search_term => search_term, :created_at => expired_date)

    assert_equivalent recent, search_term.occurrences.valid
  end
end
| ... | ... | @@ -0,0 +1,81 @@ |
| 1 | +require 'test_helper' | |
| 2 | + | |
# Unit tests for SearchTerm: validations, find_or_create and score calculation.
class SearchTermTest < ActiveSupport::TestCase
  should 'have term' do
    search_term = SearchTerm.new
    assert !search_term.valid?
    assert search_term.errors.has_key?(:term)
  end

  should 'have context' do
    search_term = SearchTerm.new
    assert !search_term.valid?
    assert search_term.errors.has_key?(:context)
  end

  should 'have unique term within specific context and asset' do
    SearchTerm.create!(:term => 'galaxy', :context => Environment.default, :asset => 'universe')
    search_term = SearchTerm.new(:term => 'galaxy', :context => Environment.default, :asset => 'universe')
    search_term.valid?
    assert search_term.errors.has_key?(:term)

    # The same term is allowed again under a different asset.
    search_term.asset = 'alternate_universe'
    search_term.valid?
    assert !search_term.errors.has_key?(:term)
  end

  should 'create a search term' do
    assert_nothing_raised do
      SearchTerm.create!(:term => 'universe', :context => Environment.default)
    end
  end

  should 'find or create by term' do
    assert_difference 'SearchTerm.count', 1 do
      SearchTerm.find_or_create('universe', Environment.default)
      search_term = SearchTerm.find_or_create('universe', Environment.default)
      assert_equal 'universe', search_term.term
    end
  end

  should 'have occurrences' do
    search_term = SearchTerm.find_or_create('universe', Environment.default)
    o1 = SearchTermOccurrence.create!(:search_term => search_term)
    o2 = SearchTermOccurrence.create!(:search_term => search_term)

    assert_equivalent [o1,o2], search_term.occurrences
  end

  should 'calculate score' do
    search_term = SearchTerm.find_or_create('universe', Environment.default)
    SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3)
    search_term.save!
    assert search_term.score > 0, "Score was not calculated."
  end

  should 'not consider expired occurrences to calculate the score' do
    search_term = SearchTerm.find_or_create('universe', Environment.default)
    SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3, :created_at => DateTime.now - (SearchTermOccurrence::EXPIRATION_TIME + 1.day))
    search_term.save!
    assert_equal 0, search_term.score, "Considered expired occurrence to calculate the score."
  end

  should 'calculate search_terms scores' do
    # A context is mandatory: SearchTerm validates its presence, so create!
    # without one would raise before the scores are ever calculated.
    st1 = SearchTerm.create!(:term => 'st1', :context => Environment.default)
    SearchTermOccurrence.create!(:search_term => st1, :total => 10, :indexed => 3)
    SearchTermOccurrence.create!(:search_term => st1, :total => 20, :indexed => 8)
    SearchTermOccurrence.create!(:search_term => st1, :total => 30, :indexed => 9)
    st2 = SearchTerm.create!(:term => 'st2', :context => Environment.default)
    SearchTermOccurrence.create!(:search_term => st2, :total => 10, :indexed => 7)
    SearchTermOccurrence.create!(:search_term => st2, :total => 20, :indexed => 16)
    SearchTermOccurrence.create!(:search_term => st2, :total => 30, :indexed => 21)

    SearchTerm.calculate_scores
    st1.reload
    st2.reload

    assert st1.score > 0, "Did not calculate st1 score."
    assert st2.score > 0, "Did not calculate st2 score."
  end

end