diff --git a/app/models/environment.rb b/app/models/environment.rb
index e3b7f31..7ebb0c7 100644
--- a/app/models/environment.rb
+++ b/app/models/environment.rb
@@ -10,6 +10,7 @@ class Environment < ActiveRecord::Base
   self.partial_updates = false
 
   has_many :tasks, :dependent => :destroy, :as => 'target'
+  has_many :search_terms, :as => :context
 
   IDENTIFY_SCRIPTS = /(php[0-9s]?|[sp]htm[l]?|pl|py|cgi|rb)/
 
diff --git a/app/models/profile.rb b/app/models/profile.rb
index 1506d8d..7054341 100644
--- a/app/models/profile.rb
+++ b/app/models/profile.rb
@@ -138,6 +138,8 @@ class Profile < ActiveRecord::Base
 
   has_many :comments_received, :class_name => 'Comment', :through => :articles, :source => :comments
 
+  has_many :search_terms, :as => :context
+
   def scraps(scrap=nil)
     scrap = scrap.is_a?(Scrap) ? scrap.id : scrap
     scrap.nil? ? Scrap.all_scraps(self) : Scrap.all_scraps(self).find(scrap)
diff --git a/app/models/search_term.rb b/app/models/search_term.rb
new file mode 100644
index 0000000..4d9c9ab
--- /dev/null
+++ b/app/models/search_term.rb
@@ -0,0 +1,49 @@
+# A search term recorded for a polymorphic context and an asset, with a score
+# derived from its occurrences. The term is unique per context and asset (see
+# the uniqueness validation below).
+class SearchTerm < ActiveRecord::Base
+  validates_presence_of :term, :context
+  validates_uniqueness_of :term, :scope => [:context_id, :context_type, :asset]
+
+  belongs_to :context, :polymorphic => true
+  has_many :occurrences, :class_name => 'SearchTermOccurrence'
+
+  attr_accessible :term, :context, :asset
+
+  # Recomputes and persists the score of every search term.
+  def self.calculate_scores
+    # calculate_score only assigns the attribute, so each record has to be
+    # saved; the before_save callback recomputes the score on the way.
+    # Validations are skipped because only the derived score changes here.
+    find_each { |search_term| search_term.save(:validate => false) }
+  end
+
+  # Returns the search term matching term and asset within context, creating
+  # it when there is none. Raises ActiveRecord::RecordInvalid if the new
+  # record is not valid.
+  def self.find_or_create(term, context, asset='all')
+    context.search_terms.where(:term => term, :asset => asset).first || context.search_terms.create!(:term => term, :asset=> asset)
+  end
+
+  before_save :calculate_score
+
+  # Assigns score from the non-expired occurrences (0 when there are none).
+  # Does not save the record.
+  def calculate_score
+    valid_occurrences = occurrences.valid
+    latest = valid_occurrences.last
+    if latest
+      # nil counters are treated as 0 instead of raising on the comparison.
+      indexed = latest.indexed.to_i
+      total = latest.total.to_i
+      # Using the formula described in this paper: http://www.soi.city.ac.uk/~ser/papers/RSJ76.pdf
+      current_relevance = indexed > 0 && total >= indexed ? -Math.log(indexed.to_f/total.to_f) : 0
+      # Damp number of occurrences with log function to decrease its effect over relevance.
+      # The + 1 keeps a single occurrence from zeroing the score (Math.log(1) == 0).
+      damped_occurrences = Math.log(valid_occurrences.count + 1)
+      self.score = (damped_occurrences * current_relevance).to_f
+    else
+      self.score = 0
+    end
+  end
+end
diff --git a/app/models/search_term_occurrence.rb b/app/models/search_term_occurrence.rb
new file mode 100644
index 0000000..fc3e20a
--- /dev/null
+++ b/app/models/search_term_occurrence.rb
@@ -0,0 +1,15 @@
+# One recorded occurrence of a SearchTerm, carrying the total and indexed
+# counters read by SearchTerm#calculate_score.
+class SearchTermOccurrence < ActiveRecord::Base
+  belongs_to :search_term
+  validates_presence_of :search_term
+  attr_accessible :search_term, :created_at, :total, :indexed
+
+  #TODO Verify this value
+  EXPIRATION_TIME = 1.month
+
+  # Occurrences created within the last EXPIRATION_TIME. The lambda makes the
+  # cutoff be computed on every call; a plain :conditions hash would freeze it
+  # at class-load time.
+  scope :valid, lambda { where("search_term_occurrences.created_at >= ?", EXPIRATION_TIME.ago) }
+end
diff --git a/db/migrate/20140507195924_create_search_term_occurrences.rb b/db/migrate/20140507195924_create_search_term_occurrences.rb
new file mode 100644
index 0000000..f4f8793
--- /dev/null
+++ b/db/migrate/20140507195924_create_search_term_occurrences.rb
@@ -0,0 +1,17 @@
+# Creates the search_term_occurrences table, indexed by created_at.
+class CreateSearchTermOccurrences < ActiveRecord::Migration
+  def up
+    create_table :search_term_occurrences do |t|
+      t.references :search_term
+      t.datetime :created_at
+      t.integer :total, :default => 0
+      t.integer :indexed, :default => 0
+    end
+    add_index :search_term_occurrences, :created_at
+  end
+
+  def down
+    remove_index :search_term_occurrences, :created_at
+    drop_table :search_term_occurrences
+  end
+end
diff --git a/db/migrate/20140507205338_create_search_terms.rb b/db/migrate/20140507205338_create_search_terms.rb
new file mode 100644
index 0000000..2e01a8d
--- /dev/null
+++ b/db/migrate/20140507205338_create_search_terms.rb
@@ -0,0 +1,18 @@
+# Creates the search_terms table, indexed by (term, asset, score).
+class CreateSearchTerms < ActiveRecord::Migration
+  def up
+    create_table :search_terms do |t|
+      t.string :term
+      t.references :context, :polymorphic => true
+      t.string :asset, :default => 'all'
+      t.float :score, :default => 0
+    end
+
+    add_index :search_terms, [:term, :asset, :score]
+  end
+
+  def down
+    remove_index :search_terms, [:term, :asset, :score]
+    drop_table :search_terms
+  end
+end
diff --git a/db/schema.rb b/db/schema.rb
index 98d6ffb..fa3123d 100644
--- a/db/schema.rb
+++ b/db/schema.rb
@@ -11,7 +11,7 @@
 #
 # It's strongly recommended to check this file into your version control system.
 
-ActiveRecord::Schema.define(:version => 20140408172149) do
+ActiveRecord::Schema.define(:version => 20140507205338) do
 
   create_table "abuse_reports", :force => true do |t|
     t.integer  "reporter_id"
@@ -550,6 +550,25 @@ ActiveRecord::Schema.define(:version => 20140408172149) do
     t.integer  "context_id"
   end
 
+  create_table "search_term_occurrences", :force => true do |t|
+    t.integer  "search_term_id"
+    t.datetime "created_at"
+    t.integer  "total",          :default => 0
+    t.integer  "indexed",        :default => 0
+  end
+
+  add_index "search_term_occurrences", ["created_at"], :name => "index_search_term_occurrences_on_created_at"
+
+  create_table "search_terms", :force => true do |t|
+    t.string  "term"
+    t.integer "context_id"
+    t.string  "context_type"
+    t.string  "asset",        :default => "all"
+    t.float   "score",        :default => 0.0
+  end
+
+  add_index "search_terms", ["term", "asset", "score"], :name => "index_search_terms_on_term_and_asset_and_score"
+
   create_table "sessions", :force => true do |t|
     t.string   "session_id", :null => false
     t.text     "data"
diff --git a/test/unit/search_term_occurrence_test.rb b/test/unit/search_term_occurrence_test.rb
new file mode 100644
index 0000000..7bfedae
--- /dev/null
+++ b/test/unit/search_term_occurrence_test.rb
@@ -0,0 +1,30 @@
+require 'test_helper'
+
+class SearchTermOccurrenceTest < ActiveSupport::TestCase
+
+  def setup
+    @search_term = SearchTerm.find_or_create('universe', Environment.default)
+  end
+
+  attr_reader :search_term
+
+  should 'have term' do
+    search_term_occurrence = SearchTermOccurrence.new
+    assert !search_term_occurrence.valid?
+    assert search_term_occurrence.errors.has_key?(:search_term)
+  end
+
+  should 'create a search term occurence' do
+    assert_nothing_raised do
+      SearchTermOccurrence.create!(:search_term => search_term)
+    end
+  end
+
+  should 'fetch only valid occurrences' do
+    o1 = SearchTermOccurrence.create!(:search_term => search_term)
+    o2 = SearchTermOccurrence.create!(:search_term => search_term)
+    o3 = SearchTermOccurrence.create!(:search_term => search_term, :created_at => DateTime.now - (SearchTermOccurrence::EXPIRATION_TIME + 1.day))
+
+    assert_equivalent [o1,o2], search_term.occurrences.valid
+  end
+end
diff --git a/test/unit/search_term_test.rb b/test/unit/search_term_test.rb
new file mode 100644
index 0000000..6b6fd4a
--- /dev/null
+++ b/test/unit/search_term_test.rb
@@ -0,0 +1,81 @@
+require 'test_helper'
+
+class SearchTermTest < ActiveSupport::TestCase
+  should 'have term' do
+    search_term = SearchTerm.new
+    assert !search_term.valid?
+    assert search_term.errors.has_key?(:term)
+  end
+
+  should 'have context' do
+    search_term = SearchTerm.new
+    assert !search_term.valid?
+    assert search_term.errors.has_key?(:context)
+  end
+
+  should 'have unique term within specific context and asset' do
+    SearchTerm.create!(:term => 'galaxy', :context => Environment.default, :asset => 'universe')
+    search_term = SearchTerm.new(:term => 'galaxy', :context => Environment.default, :asset => 'universe')
+    search_term.valid?
+    assert search_term.errors.has_key?(:term)
+
+    search_term.asset = 'alternate_universe'
+    search_term.valid?
+    assert !search_term.errors.has_key?(:term)
+  end
+
+  should 'create a search term' do
+    assert_nothing_raised do
+      SearchTerm.create!(:term => 'universe', :context => Environment.default)
+    end
+  end
+
+  should 'find or create by term' do
+    assert_difference 'SearchTerm.count', 1 do
+      SearchTerm.find_or_create('universe', Environment.default)
+      search_term = SearchTerm.find_or_create('universe', Environment.default)
+      assert_equal 'universe', search_term.term
+    end
+  end
+
+  should 'have occurrences' do
+    search_term = SearchTerm.find_or_create('universe', Environment.default)
+    o1 = SearchTermOccurrence.create!(:search_term => search_term)
+    o2 = SearchTermOccurrence.create!(:search_term => search_term)
+
+    assert_equivalent [o1,o2], search_term.occurrences
+  end
+
+  should 'calculate score' do
+    search_term = SearchTerm.find_or_create('universe', Environment.default)
+    occurrence = SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3)
+    search_term.save!
+    assert search_term.score > 0, "Score was not calculated."
+  end
+
+  should 'not consider expired occurrences to calculate the score' do
+    search_term = SearchTerm.find_or_create('universe', Environment.default)
+    occurrence = SearchTermOccurrence.create!(:search_term => search_term, :total => 10, :indexed => 3, :created_at => DateTime.now - (SearchTermOccurrence::EXPIRATION_TIME + 1.day))
+    search_term.save!
+    assert search_term.score == 0, "Considered expired occurrence to calculate the score."
+  end
+
+  should 'calculate search_terms scores' do
+    st1 = SearchTerm.create!(:term => 'st1', :context => Environment.default)
+    SearchTermOccurrence.create!(:search_term => st1, :total => 10, :indexed => 3)
+    SearchTermOccurrence.create!(:search_term => st1, :total => 20, :indexed => 8)
+    SearchTermOccurrence.create!(:search_term => st1, :total => 30, :indexed => 9)
+    st2 = SearchTerm.create!(:term => 'st2', :context => Environment.default)
+    SearchTermOccurrence.create!(:search_term => st2, :total => 10, :indexed => 7)
+    SearchTermOccurrence.create!(:search_term => st2, :total => 20, :indexed => 16)
+    SearchTermOccurrence.create!(:search_term => st2, :total => 30, :indexed => 21)
+
+    SearchTerm.calculate_scores
+    st1.reload
+    st2.reload
+
+    assert st1.score > 0, "Did not calculate st1 score."
+    assert st2.score > 0, "Did not calculate st2 score."
+  end
+
+end
-- 
libgit2 0.21.2