Commit b374f2c57ee9a01617191cb02753613d6e47d674

Authored by Rodrigo Souto
1 parent ab66aa9b

[search-improvements] Add infrastructure for search_terms score calculation

app/models/environment.rb
... ... @@ -10,6 +10,7 @@ class Environment < ActiveRecord::Base
10 10 self.partial_updates = false
11 11  
12 12 has_many :tasks, :dependent => :destroy, :as => 'target'
  13 + has_many :search_terms, :as => :context
13 14  
14 15 IDENTIFY_SCRIPTS = /(php[0-9s]?|[sp]htm[l]?|pl|py|cgi|rb)/
15 16  
... ...
app/models/profile.rb
... ... @@ -138,6 +138,8 @@ class Profile < ActiveRecord::Base
138 138  
139 139 has_many :comments_received, :class_name => 'Comment', :through => :articles, :source => :comments
140 140  
  141 + has_many :search_terms, :as => :context
  142 +
141 143 def scraps(scrap=nil)
142 144 scrap = scrap.is_a?(Scrap) ? scrap.id : scrap
143 145 scrap.nil? ? Scrap.all_scraps(self) : Scrap.all_scraps(self).find(scrap)
... ...
app/models/search_term.rb 0 → 100644
... ... @@ -0,0 +1,34 @@
# A term searched for within a context (the polymorphic +context+
# association) and an asset, together with a relevance score derived from
# its recorded occurrences.
class SearchTerm < ActiveRecord::Base
  validates_presence_of :term, :context
  validates_uniqueness_of :term, :scope => [:context_id, :context_type, :asset]

  belongs_to :context, :polymorphic => true
  has_many :occurrences, :class_name => 'SearchTermOccurrence'

  attr_accessible :term, :context, :asset

  # The score is refreshed every time the record is saved.
  before_save :calculate_score

  # Recalculates and persists the score of every search term.
  #
  # Each record is saved rather than merely asked to calculate its score:
  # calculate_score only assigns the in-memory attribute, so without a save
  # the new value would be discarded with the instance yielded by find_each.
  # Saving runs the before_save callback, which performs the recalculation.
  # Raises ActiveRecord::RecordInvalid if a record can no longer be saved.
  def self.calculate_scores
    find_each { |search_term| search_term.save! }
  end

  # Returns the search term for +term+ and +asset+ inside +context+,
  # creating it when it does not exist yet.
  #
  # term    - the searched text.
  # context - an object exposing a +search_terms+ association.
  # asset   - the asset the search was restricted to (defaults to 'all').
  #
  # Raises ActiveRecord::RecordInvalid if the new record is not valid.
  def self.find_or_create(term, context, asset='all')
    context.search_terms.where(:term => term, :asset => asset).first || context.search_terms.create!(:term => term, :asset=> asset)
  end

  # Sets +score+ from the non-expired occurrences of this term.
  #
  # The score is 0 when there is no valid occurrence. Otherwise it is the
  # relevance computed from the most recent valid occurrence multiplied by a
  # log-damped count of valid occurrences. Only assigns the attribute; it
  # does not save the record.
  def calculate_score
    valid_occurrences = occurrences.valid
    # Fetch the most recent valid occurrence once; nil means there is none.
    latest = valid_occurrences.last
    if latest
      indexed = latest.indexed
      total = latest.total
      # Using the formula described on this paper: http://www.soi.city.ac.uk/~ser/papers/RSJ76.pdf
      # The guard also rules out a division by zero (total >= indexed > 0).
      current_relevance = indexed > 0 && total >= indexed ? -Math.log(indexed.to_f / total.to_f) : 0
      # Damp number of occurrences with log function to decrease its effect
      # over relevance. One is added to the count because log(1) is 0, which
      # would otherwise force the score of a term searched once to 0.
      damped_occurrences = Math.log(1 + valid_occurrences.count)
      self.score = (damped_occurrences * current_relevance).to_f
    else
      self.score = 0
    end
  end
end
... ...
app/models/search_term_occurrence.rb 0 → 100644
... ... @@ -0,0 +1,10 @@
# One recorded occurrence of a search term, holding the +total+ and
# +indexed+ counters used by SearchTerm#calculate_score.
class SearchTermOccurrence < ActiveRecord::Base
  belongs_to :search_term
  validates_presence_of :search_term
  attr_accessible :search_term, :created_at, :total, :indexed

  #TODO Verify this value
  # Occurrences older than this are excluded by the +valid+ scope.
  EXPIRATION_TIME = 1.month

  # Occurrences created within the last EXPIRATION_TIME.
  #
  # The condition is wrapped in a lambda so the cutoff is computed each time
  # the scope is used. A plain :conditions hash would evaluate DateTime.now
  # once, when the class is loaded, and the window would stop moving for the
  # lifetime of the process.
  scope :valid, lambda {
    where("search_term_occurrences.created_at >= ?", DateTime.now - EXPIRATION_TIME)
  }
end
... ...
db/migrate/20140507195924_create_search_term_occurrences.rb 0 → 100644
... ... @@ -0,0 +1,16 @@
# Creates the search_term_occurrences table: a reference to a search term,
# a creation timestamp and two integer counters (total, indexed) that
# default to 0. The created_at column is indexed.
class CreateSearchTermOccurrences < ActiveRecord::Migration
  def up
    create_table(:search_term_occurrences) do |table|
      table.references(:search_term)
      table.datetime(:created_at)
      table.integer(:total, :default => 0)
      table.integer(:indexed, :default => 0)
    end

    add_index(:search_term_occurrences, :created_at)
  end

  # Reverts #up: removes the index first, then the table itself.
  def down
    remove_index(:search_term_occurrences, :created_at)
    drop_table(:search_term_occurrences)
  end
end
... ...
db/migrate/20140507205338_create_search_terms.rb 0 → 100644
... ... @@ -0,0 +1,17 @@
# Creates the search_terms table: the term text, a polymorphic context
# reference, the asset (default 'all') and a float score (default 0), plus a
# composite index over term, asset and score.
class CreateSearchTerms < ActiveRecord::Migration
  def up
    create_table(:search_terms) do |table|
      table.string(:term)
      table.references(:context, :polymorphic => true)
      table.string(:asset, :default => 'all')
      table.float(:score, :default => 0)
    end

    add_index(:search_terms, [:term, :asset, :score])
  end

  # Reverts #up: removes the composite index first, then the table itself.
  def down
    remove_index(:search_terms, [:term, :asset, :score])
    drop_table(:search_terms)
  end
end
... ...
db/schema.rb
... ... @@ -11,7 +11,7 @@
11 11 #
12 12 # It's strongly recommended to check this file into your version control system.
13 13  
14   -ActiveRecord::Schema.define(:version => 20140408172149) do
  14 +ActiveRecord::Schema.define(:version => 20140507205338) do
15 15  
16 16 create_table "abuse_reports", :force => true do |t|
17 17 t.integer "reporter_id"
... ... @@ -550,6 +550,25 @@ ActiveRecord::Schema.define(:version =&gt; 20140408172149) do
550 550 t.integer "context_id"
551 551 end
552 552  
  553 + create_table "search_term_occurrences", :force => true do |t|
  554 + t.integer "search_term_id"
  555 + t.datetime "created_at"
  556 + t.integer "total", :default => 0
  557 + t.integer "indexed", :default => 0
  558 + end
  559 +
  560 + add_index "search_term_occurrences", ["created_at"], :name => "index_search_term_occurrences_on_created_at"
  561 +
  562 + create_table "search_terms", :force => true do |t|
  563 + t.string "term"
  564 + t.integer "context_id"
  565 + t.string "context_type"
  566 + t.string "asset", :default => "all"
  567 + t.float "score", :default => 0.0
  568 + end
  569 +
  570 + add_index "search_terms", ["term", "asset", "score"], :name => "index_search_terms_on_term_and_asset_and_score"
  571 +
553 572 create_table "sessions", :force => true do |t|
554 573 t.string "session_id", :null => false
555 574 t.text "data"
... ...
test/unit/search_term_occurrence_test.rb 0 → 100644
... ... @@ -0,0 +1,30 @@
  1 +require 'test_helper'
  2 +
# Unit tests for SearchTermOccurrence: the search_term presence validation,
# record creation and the +valid+ scope.
class SearchTermOccurrenceTest < ActiveSupport::TestCase

  attr_reader :search_term

  # Every test works against the same 'universe' term of the default
  # environment.
  def setup
    @search_term = SearchTerm.find_or_create('universe', Environment.default)
  end

  should 'have term' do
    occurrence = SearchTermOccurrence.new
    assert !occurrence.valid?
    assert occurrence.errors.has_key?(:search_term)
  end

  should 'create a search term occurence' do
    assert_nothing_raised { SearchTermOccurrence.create!(:search_term => search_term) }
  end

  should 'fetch only valid occurrences' do
    first_recent = SearchTermOccurrence.create!(:search_term => search_term)
    second_recent = SearchTermOccurrence.create!(:search_term => search_term)
    # One day past the expiration window, so the scope must leave it out.
    expired_at = DateTime.now - (SearchTermOccurrence::EXPIRATION_TIME + 1.day)
    SearchTermOccurrence.create!(:search_term => search_term, :created_at => expired_at)

    assert_equivalent [first_recent, second_recent], search_term.occurrences.valid
  end
end
... ...
test/unit/search_term_test.rb 0 → 100644
... ... @@ -0,0 +1,81 @@
  1 +require 'test_helper'
  2 +
# Unit tests for SearchTerm: validations, find_or_create, the occurrences
# association and score calculation.
class SearchTermTest < ActiveSupport::TestCase
  should 'have term' do
    search_term = SearchTerm.new
    assert !search_term.valid?
    assert search_term.errors.has_key?(:term)
  end

  should 'have context' do
    search_term = SearchTerm.new
    assert !search_term.valid?
    assert search_term.errors.has_key?(:context)
  end

  should 'have unique term within specific context and asset' do
    SearchTerm.create!(:term => 'galaxy', :context => Environment.default, :asset => 'universe')
    search_term = SearchTerm.new(:term => 'galaxy', :context => Environment.default, :asset => 'universe')
    search_term.valid?
    assert search_term.errors.has_key?(:term)

    search_term.asset = 'alternate_universe'
    search_term.valid?
    assert !search_term.errors.has_key?(:term)
  end

  should 'create a search term' do
    assert_nothing_raised do
      SearchTerm.create!(:term => 'universe', :context => Environment.default)
    end
  end

  should 'find or create by term' do
    assert_difference 'SearchTerm.count', 1 do
      SearchTerm.find_or_create('universe', Environment.default)
      search_term = SearchTerm.find_or_create('universe', Environment.default)
      assert_equal 'universe', search_term.term
    end
  end

  should 'have occurrences' do
    search_term = SearchTerm.find_or_create('universe', Environment.default)
    o1 = SearchTermOccurrence.create!(:search_term => search_term)
    o2 = SearchTermOccurrence.create!(:search_term => search_term)

    assert_equivalent [o1,o2], search_term.occurrences
  end

  should 'calculate score' do
    search_term = SearchTerm.find_or_create('universe', Environment.default)
    SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3)
    # Saving runs the before_save callback that calculates the score.
    search_term.save!
    assert search_term.score > 0, "Score was not calculated."
  end

  should 'not consider expired occurrences to calculate the score' do
    search_term = SearchTerm.find_or_create('universe', Environment.default)
    SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3, :created_at => DateTime.now - (SearchTermOccurrence::EXPIRATION_TIME + 1.day))
    search_term.save!
    assert search_term.score == 0, "Considered expired occurrence to calculate the score."
  end

  should 'calculate search_terms scores' do
    # A context is mandatory (validates_presence_of :context); without it
    # create! raises before the scores are ever calculated.
    st1 = SearchTerm.create!(:term => 'st1', :context => Environment.default)
    SearchTermOccurrence.create!(:search_term => st1, :total => 10, :indexed => 3)
    SearchTermOccurrence.create!(:search_term => st1, :total => 20, :indexed => 8)
    SearchTermOccurrence.create!(:search_term => st1, :total => 30, :indexed => 9)
    st2 = SearchTerm.create!(:term => 'st2', :context => Environment.default)
    SearchTermOccurrence.create!(:search_term => st2, :total => 10, :indexed => 7)
    SearchTermOccurrence.create!(:search_term => st2, :total => 20, :indexed => 16)
    SearchTermOccurrence.create!(:search_term => st2, :total => 30, :indexed => 21)

    SearchTerm.calculate_scores
    # Reload to check the values that were actually persisted.
    st1.reload
    st2.reload

    assert st1.score > 0, "Did not calculate st1 score."
    assert st2.score > 0, "Did not calculate st2 score."
  end

end
... ...