Commit b374f2c57ee9a01617191cb02753613d6e47d674
1 parent
ab66aa9b
Exists in
master
and in
29 other branches
[search-improvements] Add infrastructure for search_terms score calculation
Showing
9 changed files
with
211 additions
and
1 deletions
Show diff stats
app/models/environment.rb
@@ -10,6 +10,7 @@ class Environment < ActiveRecord::Base | @@ -10,6 +10,7 @@ class Environment < ActiveRecord::Base | ||
10 | self.partial_updates = false | 10 | self.partial_updates = false |
11 | 11 | ||
12 | has_many :tasks, :dependent => :destroy, :as => 'target' | 12 | has_many :tasks, :dependent => :destroy, :as => 'target' |
13 | + has_many :search_terms, :as => :context | ||
13 | 14 | ||
14 | IDENTIFY_SCRIPTS = /(php[0-9s]?|[sp]htm[l]?|pl|py|cgi|rb)/ | 15 | IDENTIFY_SCRIPTS = /(php[0-9s]?|[sp]htm[l]?|pl|py|cgi|rb)/ |
15 | 16 |
app/models/profile.rb
@@ -138,6 +138,8 @@ class Profile < ActiveRecord::Base | @@ -138,6 +138,8 @@ class Profile < ActiveRecord::Base | ||
138 | 138 | ||
139 | has_many :comments_received, :class_name => 'Comment', :through => :articles, :source => :comments | 139 | has_many :comments_received, :class_name => 'Comment', :through => :articles, :source => :comments |
140 | 140 | ||
141 | + has_many :search_terms, :as => :context | ||
142 | + | ||
141 | def scraps(scrap=nil) | 143 | def scraps(scrap=nil) |
142 | scrap = scrap.is_a?(Scrap) ? scrap.id : scrap | 144 | scrap = scrap.is_a?(Scrap) ? scrap.id : scrap |
143 | scrap.nil? ? Scrap.all_scraps(self) : Scrap.all_scraps(self).find(scrap) | 145 | scrap.nil? ? Scrap.all_scraps(self) : Scrap.all_scraps(self).find(scrap) |
@@ -0,0 +1,34 @@ | @@ -0,0 +1,34 @@ | ||
# A term searched within a context (any record exposing a +search_terms+
# association through the polymorphic +context+) for a given asset, together
# with a relevance score computed from its non-expired occurrences.
class SearchTerm < ActiveRecord::Base
  validates_presence_of :term, :context
  validates_uniqueness_of :term, :scope => [:context_id, :context_type, :asset]

  belongs_to :context, :polymorphic => true
  has_many :occurrences, :class_name => 'SearchTermOccurrence'

  attr_accessible :term, :context, :asset

  # The score is refreshed on every save, so saving a term is enough to bring
  # its stored score up to date.
  before_save :calculate_score

  # Recalculates and persists the score of every search term.
  #
  # calculate_score only assigns the attribute in memory, so each record is
  # saved: the before_save callback recomputes the score and the save writes
  # it. A record that fails validation is left untouched instead of aborting
  # the whole batch.
  def self.calculate_scores
    find_each { |search_term| search_term.save }
  end

  # Returns the search term matching +term+ and +asset+ inside +context+,
  # creating it when none exists yet.
  #
  # term    - the searched string.
  # context - a record that responds to +search_terms+.
  # asset   - the asset the search was restricted to (defaults to 'all').
  #
  # Raises ActiveRecord::RecordInvalid when the new record is not valid.
  def self.find_or_create(term, context, asset='all')
    terms = context.search_terms
    terms.where(:term => term, :asset => asset).first ||
      terms.create!(:term => term, :asset => asset)
  end

  # Assigns +score+ from the valid occurrences of this term: the relevance of
  # the last valid occurrence weighted by the damped number of valid
  # occurrences. The score is 0 when there is no valid occurrence.
  def calculate_score
    valid_occurrences = occurrences.valid
    latest = valid_occurrences.last

    if latest
      indexed = latest.indexed.to_i
      total = latest.total.to_i
      # Using the formula described on this paper: http://www.soi.city.ac.uk/~ser/papers/RSJ76.pdf
      current_relevance = indexed > 0 && total >= indexed ? -Math.log(indexed.to_f / total) : 0
      # Damp the number of occurrences with a log function to decrease its
      # effect over relevance. The count is shifted by one because log(1) is
      # 0, which would otherwise zero the score of a term seen only once.
      damped_occurrences = Math.log(valid_occurrences.count + 1)
      self.score = (damped_occurrences * current_relevance).to_f
    else
      self.score = 0
    end
  end
end
@@ -0,0 +1,10 @@ | @@ -0,0 +1,10 @@ | ||
# A single recorded occurrence of a search term. Besides the owning search
# term it stores a creation time and the integer counters +total+ and
# +indexed+.
class SearchTermOccurrence < ActiveRecord::Base
  belongs_to :search_term
  validates_presence_of :search_term
  attr_accessible :search_term, :created_at, :total, :indexed

  # TODO: Verify this value
  EXPIRATION_TIME = 1.month

  # Occurrences created within the last EXPIRATION_TIME.
  #
  # The scope is a lambda so the cutoff is computed each time the scope is
  # used. Passing the conditions directly would evaluate DateTime.now only
  # once, when the class is loaded, and the expiration window would never
  # move forward in a long-running process.
  scope :valid, lambda {
    where("search_term_occurrences.created_at >= ?", DateTime.now - EXPIRATION_TIME)
  }
end
db/migrate/20140507195924_create_search_term_occurrences.rb
0 → 100644
@@ -0,0 +1,16 @@ | @@ -0,0 +1,16 @@ | ||
# Creates the search_term_occurrences table, with an index on created_at.
class CreateSearchTermOccurrences < ActiveRecord::Migration
  # Builds the table and then indexes its creation timestamp.
  def up
    create_table :search_term_occurrences do |table|
      table.integer :search_term_id
      table.datetime :created_at
      table.integer :total, :default => 0
      table.integer :indexed, :default => 0
    end

    add_index :search_term_occurrences, [:created_at]
  end

  # Reverts +up+: removes the index first, then the table itself.
  def down
    remove_index :search_term_occurrences, :column => :created_at
    drop_table :search_term_occurrences
  end
end
@@ -0,0 +1,17 @@ | @@ -0,0 +1,17 @@ | ||
# Creates the search_terms table, with a composite index on term, asset and
# score.
class CreateSearchTerms < ActiveRecord::Migration
  # Builds the table and then adds the composite index.
  def up
    create_table :search_terms do |table|
      table.string :term
      # Polymorphic reference to the context, spelled out as its two columns.
      table.integer :context_id
      table.string :context_type
      table.string :asset, :default => 'all'
      table.float :score, :default => 0
    end

    add_index :search_terms, [:term, :asset, :score]
  end

  # Reverts +up+: removes the composite index first, then the table itself.
  def down
    remove_index :search_terms, :column => [:term, :asset, :score]
    drop_table :search_terms
  end
end
db/schema.rb
@@ -11,7 +11,7 @@ | @@ -11,7 +11,7 @@ | ||
11 | # | 11 | # |
12 | # It's strongly recommended to check this file into your version control system. | 12 | # It's strongly recommended to check this file into your version control system. |
13 | 13 | ||
14 | -ActiveRecord::Schema.define(:version => 20140408172149) do | 14 | +ActiveRecord::Schema.define(:version => 20140507205338) do |
15 | 15 | ||
16 | create_table "abuse_reports", :force => true do |t| | 16 | create_table "abuse_reports", :force => true do |t| |
17 | t.integer "reporter_id" | 17 | t.integer "reporter_id" |
@@ -550,6 +550,25 @@ ActiveRecord::Schema.define(:version => 20140408172149) do | @@ -550,6 +550,25 @@ ActiveRecord::Schema.define(:version => 20140408172149) do | ||
550 | t.integer "context_id" | 550 | t.integer "context_id" |
551 | end | 551 | end |
552 | 552 | ||
553 | + create_table "search_term_occurrences", :force => true do |t| | ||
554 | + t.integer "search_term_id" | ||
555 | + t.datetime "created_at" | ||
556 | + t.integer "total", :default => 0 | ||
557 | + t.integer "indexed", :default => 0 | ||
558 | + end | ||
559 | + | ||
560 | + add_index "search_term_occurrences", ["created_at"], :name => "index_search_term_occurrences_on_created_at" | ||
561 | + | ||
562 | + create_table "search_terms", :force => true do |t| | ||
563 | + t.string "term" | ||
564 | + t.integer "context_id" | ||
565 | + t.string "context_type" | ||
566 | + t.string "asset", :default => "all" | ||
567 | + t.float "score", :default => 0.0 | ||
568 | + end | ||
569 | + | ||
570 | + add_index "search_terms", ["term", "asset", "score"], :name => "index_search_terms_on_term_and_asset_and_score" | ||
571 | + | ||
553 | create_table "sessions", :force => true do |t| | 572 | create_table "sessions", :force => true do |t| |
554 | t.string "session_id", :null => false | 573 | t.string "session_id", :null => false |
555 | t.text "data" | 574 | t.text "data" |
@@ -0,0 +1,30 @@ | @@ -0,0 +1,30 @@ | ||
1 | +require 'test_helper' | ||
2 | + | ||
# Unit tests for SearchTermOccurrence: validation, creation and the +valid+
# scope.
class SearchTermOccurrenceTest < ActiveSupport::TestCase

  attr_reader :search_term

  # Every test works on the same search term of the default environment.
  def setup
    @search_term = SearchTerm.find_or_create('universe', Environment.default)
  end

  should 'have term' do
    occurrence = SearchTermOccurrence.new
    assert !occurrence.valid?
    assert occurrence.errors.has_key?(:search_term)
  end

  should 'create a search term occurence' do
    assert_nothing_raised { SearchTermOccurrence.create!(:search_term => search_term) }
  end

  should 'fetch only valid occurrences' do
    recent = 2.times.map { SearchTermOccurrence.create!(:search_term => search_term) }
    # One extra occurrence dated a day beyond the expiration window.
    expired_at = DateTime.now - (SearchTermOccurrence::EXPIRATION_TIME + 1.day)
    SearchTermOccurrence.create!(:search_term => search_term, :created_at => expired_at)

    assert_equivalent recent, search_term.occurrences.valid
  end
end
@@ -0,0 +1,81 @@ | @@ -0,0 +1,81 @@ | ||
1 | +require 'test_helper' | ||
2 | + | ||
# Unit tests for SearchTerm: validations, find_or_create, the occurrences
# association and score calculation.
class SearchTermTest < ActiveSupport::TestCase
  should 'have term' do
    search_term = SearchTerm.new
    assert !search_term.valid?
    assert search_term.errors.has_key?(:term)
  end

  should 'have context' do
    search_term = SearchTerm.new
    assert !search_term.valid?
    assert search_term.errors.has_key?(:context)
  end

  should 'have unique term within specific context and asset' do
    SearchTerm.create!(:term => 'galaxy', :context => Environment.default, :asset => 'universe')
    search_term = SearchTerm.new(:term => 'galaxy', :context => Environment.default, :asset => 'universe')
    search_term.valid?
    assert search_term.errors.has_key?(:term)

    search_term.asset = 'alternate_universe'
    search_term.valid?
    assert !search_term.errors.has_key?(:term)
  end

  should 'create a search term' do
    assert_nothing_raised do
      SearchTerm.create!(:term => 'universe', :context => Environment.default)
    end
  end

  should 'find or create by term' do
    assert_difference 'SearchTerm.count', 1 do
      SearchTerm.find_or_create('universe', Environment.default)
      search_term = SearchTerm.find_or_create('universe', Environment.default)
      assert_equal 'universe', search_term.term
    end
  end

  should 'have occurrences' do
    search_term = SearchTerm.find_or_create('universe', Environment.default)
    o1 = SearchTermOccurrence.create!(:search_term => search_term)
    o2 = SearchTermOccurrence.create!(:search_term => search_term)

    assert_equivalent [o1,o2], search_term.occurrences
  end

  should 'calculate score' do
    search_term = SearchTerm.find_or_create('universe', Environment.default)
    SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3)
    search_term.save!
    assert search_term.score > 0, "Score was not calculated."
  end

  should 'not consider expired occurrences to calculate the score' do
    search_term = SearchTerm.find_or_create('universe', Environment.default)
    expired_at = DateTime.now - (SearchTermOccurrence::EXPIRATION_TIME + 1.day)
    SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3, :created_at => expired_at)
    search_term.save!
    assert_equal 0, search_term.score, "Considered expired occurrence to calculate the score."
  end

  should 'calculate search_terms scores' do
    # SearchTerm validates the presence of its context, so the terms must be
    # created inside one; otherwise create! raises before anything is tested.
    st1 = SearchTerm.create!(:term => 'st1', :context => Environment.default)
    SearchTermOccurrence.create!(:search_term => st1, :total => 10, :indexed => 3)
    SearchTermOccurrence.create!(:search_term => st1, :total => 20, :indexed => 8)
    SearchTermOccurrence.create!(:search_term => st1, :total => 30, :indexed => 9)
    st2 = SearchTerm.create!(:term => 'st2', :context => Environment.default)
    SearchTermOccurrence.create!(:search_term => st2, :total => 10, :indexed => 7)
    SearchTermOccurrence.create!(:search_term => st2, :total => 20, :indexed => 16)
    SearchTermOccurrence.create!(:search_term => st2, :total => 30, :indexed => 21)

    SearchTerm.calculate_scores
    st1.reload
    st2.reload

    assert st1.score > 0, "Did not calculate st1 score."
    assert st2.score > 0, "Did not calculate st2 score."
  end

end