Commit b374f2c57ee9a01617191cb02753613d6e47d674
1 parent
ab66aa9b
Exists in
master
and in
22 other branches
[search-improvements] Add infrastructure for search_terms score calculation
Showing
9 changed files
with
211 additions
and
1 deletions
Show diff stats
app/models/environment.rb
| @@ -10,6 +10,7 @@ class Environment < ActiveRecord::Base | @@ -10,6 +10,7 @@ class Environment < ActiveRecord::Base | ||
| 10 | self.partial_updates = false | 10 | self.partial_updates = false |
| 11 | 11 | ||
| 12 | has_many :tasks, :dependent => :destroy, :as => 'target' | 12 | has_many :tasks, :dependent => :destroy, :as => 'target' |
| 13 | + has_many :search_terms, :as => :context | ||
| 13 | 14 | ||
| 14 | IDENTIFY_SCRIPTS = /(php[0-9s]?|[sp]htm[l]?|pl|py|cgi|rb)/ | 15 | IDENTIFY_SCRIPTS = /(php[0-9s]?|[sp]htm[l]?|pl|py|cgi|rb)/ |
| 15 | 16 |
app/models/profile.rb
| @@ -138,6 +138,8 @@ class Profile < ActiveRecord::Base | @@ -138,6 +138,8 @@ class Profile < ActiveRecord::Base | ||
| 138 | 138 | ||
| 139 | has_many :comments_received, :class_name => 'Comment', :through => :articles, :source => :comments | 139 | has_many :comments_received, :class_name => 'Comment', :through => :articles, :source => :comments |
| 140 | 140 | ||
| 141 | + has_many :search_terms, :as => :context | ||
| 142 | + | ||
| 141 | def scraps(scrap=nil) | 143 | def scraps(scrap=nil) |
| 142 | scrap = scrap.is_a?(Scrap) ? scrap.id : scrap | 144 | scrap = scrap.is_a?(Scrap) ? scrap.id : scrap |
| 143 | scrap.nil? ? Scrap.all_scraps(self) : Scrap.all_scraps(self).find(scrap) | 145 | scrap.nil? ? Scrap.all_scraps(self) : Scrap.all_scraps(self).find(scrap) |
| @@ -0,0 +1,34 @@ | @@ -0,0 +1,34 @@ | ||
# A term searched within a polymorphic context and a given asset, together
# with a relevance score computed from its recorded occurrences.
class SearchTerm < ActiveRecord::Base
  validates_presence_of :term, :context
  validates_uniqueness_of :term, :scope => [:context_id, :context_type, :asset]

  belongs_to :context, :polymorphic => true
  has_many :occurrences, :class_name => 'SearchTermOccurrence'

  attr_accessible :term, :context, :asset

  # Keeps the stored score in sync with the occurrences on every save.
  before_save :calculate_score

  # Recalculates and persists the score of every search term.
  #
  # calculate_score only assigns the attribute in memory, so each record is
  # saved; the before_save callback recomputes the score as part of that save.
  # Records that fail validation are left untouched (save returns false).
  def self.calculate_scores
    find_each { |search_term| search_term.save }
  end

  # Returns the search term matching +term+ and +asset+ within +context+,
  # creating it when none exists yet.
  #
  # term    - the searched text.
  # context - a record exposing a +search_terms+ association.
  # asset   - the asset name stored with the term (defaults to 'all').
  #
  # Raises ActiveRecord::RecordInvalid if the new record fails validation.
  def self.find_or_create(term, context, asset='all')
    attributes = { :term => term, :asset => asset }
    context.search_terms.where(attributes).first || context.search_terms.create!(attributes)
  end

  # Assigns +score+ from the non-expired occurrences; it does not save.
  #
  # The score is the relevance of the latest valid occurrence multiplied by a
  # log-damped count of valid occurrences. It is 0 when there are no valid
  # occurrences or when the latest counters are not usable
  # (indexed <= 0 or total < indexed).
  def calculate_score
    valid_occurrences = occurrences.valid
    if valid_occurrences.present?
      latest = valid_occurrences.last
      # to_i guards against counters explicitly stored as NULL.
      indexed = latest.indexed.to_i
      total = latest.total.to_i
      # Using the formula described on this paper: http://www.soi.city.ac.uk/~ser/papers/RSJ76.pdf
      # log(total/indexed) equals -log(indexed/total) but yields 0.0 rather
      # than -0.0 when both counters are equal.
      current_relevance = indexed > 0 && total >= indexed ? Math.log(total.to_f / indexed) : 0
      # Damp number of occurrences with log function to decrease its effect over relevance.
      # The +1 keeps a single occurrence from zeroing the score (log(1) == 0).
      damped_occurrences = Math.log(valid_occurrences.size + 1)
      self.score = (damped_occurrences * current_relevance).to_f
    else
      self.score = 0
    end
  end
end
| @@ -0,0 +1,10 @@ | @@ -0,0 +1,10 @@ | ||
# A single recorded occurrence of a SearchTerm, carrying the +total+ and
# +indexed+ counters read when the term's score is calculated.
class SearchTermOccurrence < ActiveRecord::Base
  belongs_to :search_term
  validates_presence_of :search_term
  attr_accessible :search_term, :created_at, :total, :indexed

  #TODO Verify this value
  EXPIRATION_TIME = 1.month

  # Occurrences created within the last EXPIRATION_TIME.
  #
  # The lambda makes the cutoff be computed each time the scope is used; a
  # plain conditions array would evaluate DateTime.now once, when the class is
  # loaded, and the expiration window would never move forward afterwards.
  scope :valid, lambda {
    where("search_term_occurrences.created_at >= ?", DateTime.now - EXPIRATION_TIME)
  }
end
db/migrate/20140507195924_create_search_term_occurrences.rb
0 → 100644
| @@ -0,0 +1,16 @@ | @@ -0,0 +1,16 @@ | ||
# Creates the search_term_occurrences table plus an index on created_at.
class CreateSearchTermOccurrences < ActiveRecord::Migration
  TABLE = :search_term_occurrences

  # Builds the table and then indexes the creation timestamp.
  def up
    create_table TABLE do |t|
      t.belongs_to :search_term
      t.datetime :created_at
      t.integer :total, :indexed, :default => 0
    end

    add_index TABLE, :created_at
  end

  # Undoes +up+ in reverse order: index first, then the table.
  def down
    remove_index TABLE, :created_at
    drop_table TABLE
  end
end
| @@ -0,0 +1,17 @@ | @@ -0,0 +1,17 @@ | ||
# Creates the search_terms table plus a composite index on term, asset and score.
class CreateSearchTerms < ActiveRecord::Migration
  # Columns of the composite index, shared by +up+ and +down+.
  INDEXED_COLUMNS = [:term, :asset, :score]

  # Builds the table (term, polymorphic context, asset, score) and its index.
  def up
    create_table :search_terms do |t|
      t.string :term
      t.belongs_to :context, :polymorphic => true
      t.string :asset, :default => 'all'
      t.float :score, :default => 0
    end

    add_index :search_terms, INDEXED_COLUMNS
  end

  # Undoes +up+ in reverse order: index first, then the table.
  def down
    remove_index :search_terms, INDEXED_COLUMNS
    drop_table :search_terms
  end
end
db/schema.rb
| @@ -11,7 +11,7 @@ | @@ -11,7 +11,7 @@ | ||
| 11 | # | 11 | # |
| 12 | # It's strongly recommended to check this file into your version control system. | 12 | # It's strongly recommended to check this file into your version control system. |
| 13 | 13 | ||
| 14 | -ActiveRecord::Schema.define(:version => 20140408172149) do | 14 | +ActiveRecord::Schema.define(:version => 20140507205338) do |
| 15 | 15 | ||
| 16 | create_table "abuse_reports", :force => true do |t| | 16 | create_table "abuse_reports", :force => true do |t| |
| 17 | t.integer "reporter_id" | 17 | t.integer "reporter_id" |
| @@ -550,6 +550,25 @@ ActiveRecord::Schema.define(:version => 20140408172149) do | @@ -550,6 +550,25 @@ ActiveRecord::Schema.define(:version => 20140408172149) do | ||
| 550 | t.integer "context_id" | 550 | t.integer "context_id" |
| 551 | end | 551 | end |
| 552 | 552 | ||
| 553 | + create_table "search_term_occurrences", :force => true do |t| | ||
| 554 | + t.integer "search_term_id" | ||
| 555 | + t.datetime "created_at" | ||
| 556 | + t.integer "total", :default => 0 | ||
| 557 | + t.integer "indexed", :default => 0 | ||
| 558 | + end | ||
| 559 | + | ||
| 560 | + add_index "search_term_occurrences", ["created_at"], :name => "index_search_term_occurrences_on_created_at" | ||
| 561 | + | ||
| 562 | + create_table "search_terms", :force => true do |t| | ||
| 563 | + t.string "term" | ||
| 564 | + t.integer "context_id" | ||
| 565 | + t.string "context_type" | ||
| 566 | + t.string "asset", :default => "all" | ||
| 567 | + t.float "score", :default => 0.0 | ||
| 568 | + end | ||
| 569 | + | ||
| 570 | + add_index "search_terms", ["term", "asset", "score"], :name => "index_search_terms_on_term_and_asset_and_score" | ||
| 571 | + | ||
| 553 | create_table "sessions", :force => true do |t| | 572 | create_table "sessions", :force => true do |t| |
| 554 | t.string "session_id", :null => false | 573 | t.string "session_id", :null => false |
| 555 | t.text "data" | 574 | t.text "data" |
| @@ -0,0 +1,30 @@ | @@ -0,0 +1,30 @@ | ||
| 1 | +require 'test_helper' | ||
| 2 | + | ||
# Unit tests for SearchTermOccurrence: validation, creation and the +valid+ scope.
class SearchTermOccurrenceTest < ActiveSupport::TestCase

  attr_reader :search_term

  # Every test gets a search term in the default environment to attach
  # occurrences to.
  def setup
    @search_term = SearchTerm.find_or_create('universe', Environment.default)
  end

  should 'have term' do
    occurrence = SearchTermOccurrence.new
    assert !occurrence.valid?
    assert occurrence.errors.has_key?(:search_term)
  end

  should 'create a search term occurence' do
    assert_nothing_raised { SearchTermOccurrence.create!(:search_term => search_term) }
  end

  should 'fetch only valid occurrences' do
    # One day past the expiration window, so this occurrence must be filtered out.
    expired_at = DateTime.now - (SearchTermOccurrence::EXPIRATION_TIME + 1.day)
    recent = 2.times.map { SearchTermOccurrence.create!(:search_term => search_term) }
    SearchTermOccurrence.create!(:search_term => search_term, :created_at => expired_at)

    assert_equivalent recent, search_term.occurrences.valid
  end
end
| @@ -0,0 +1,81 @@ | @@ -0,0 +1,81 @@ | ||
| 1 | +require 'test_helper' | ||
| 2 | + | ||
# Unit tests for SearchTerm: validations, lookup/creation, occurrences and
# score calculation.
class SearchTermTest < ActiveSupport::TestCase
  should 'have term' do
    search_term = SearchTerm.new
    assert !search_term.valid?
    assert search_term.errors.has_key?(:term)
  end

  should 'have context' do
    search_term = SearchTerm.new
    assert !search_term.valid?
    assert search_term.errors.has_key?(:context)
  end

  should 'have unique term within specific context and asset' do
    SearchTerm.create!(:term => 'galaxy', :context => Environment.default, :asset => 'universe')
    search_term = SearchTerm.new(:term => 'galaxy', :context => Environment.default, :asset => 'universe')
    search_term.valid?
    assert search_term.errors.has_key?(:term)

    # The same term under a different asset is not a duplicate.
    search_term.asset = 'alternate_universe'
    search_term.valid?
    assert !search_term.errors.has_key?(:term)
  end

  should 'create a search term' do
    assert_nothing_raised do
      SearchTerm.create!(:term => 'universe', :context => Environment.default)
    end
  end

  should 'find or create by term' do
    # Two calls with the same arguments must create a single record.
    assert_difference 'SearchTerm.count', 1 do
      SearchTerm.find_or_create('universe', Environment.default)
      search_term = SearchTerm.find_or_create('universe', Environment.default)
      assert_equal 'universe', search_term.term
    end
  end

  should 'have occurrences' do
    search_term = SearchTerm.find_or_create('universe', Environment.default)
    o1 = SearchTermOccurrence.create!(:search_term => search_term)
    o2 = SearchTermOccurrence.create!(:search_term => search_term)

    assert_equivalent [o1,o2], search_term.occurrences
  end

  should 'calculate score' do
    search_term = SearchTerm.find_or_create('universe', Environment.default)
    SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3)
    search_term.save!
    assert search_term.score > 0, "Score was not calculated."
  end

  should 'not consider expired occurrences to calculate the score' do
    search_term = SearchTerm.find_or_create('universe', Environment.default)
    expired_at = DateTime.now - (SearchTermOccurrence::EXPIRATION_TIME + 1.day)
    SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3, :created_at => expired_at)
    search_term.save!
    assert search_term.score == 0, "Considered expired occurrence to calculate the score."
  end

  should 'calculate search_terms scores' do
    # A context is mandatory for SearchTerm, so create! would raise without it.
    st1 = SearchTerm.create!(:term => 'st1', :context => Environment.default)
    SearchTermOccurrence.create!(:search_term => st1, :total => 10, :indexed => 3)
    SearchTermOccurrence.create!(:search_term => st1, :total => 20, :indexed => 8)
    SearchTermOccurrence.create!(:search_term => st1, :total => 30, :indexed => 9)
    st2 = SearchTerm.create!(:term => 'st2', :context => Environment.default)
    SearchTermOccurrence.create!(:search_term => st2, :total => 10, :indexed => 7)
    SearchTermOccurrence.create!(:search_term => st2, :total => 20, :indexed => 16)
    SearchTermOccurrence.create!(:search_term => st2, :total => 30, :indexed => 21)

    SearchTerm.calculate_scores
    # Reload to check what was actually persisted, not the in-memory objects.
    st1.reload
    st2.reload

    assert st1.score > 0, "Did not calculate st1 score."
    assert st2.score > 0, "Did not calculate st2 score."
  end

end