# RDig is an optional dependency. When it cannot be loaded the RdigAdapter
# module below is still defined, but stays empty.
begin
  require 'rdig'
rescue LoadError
end

module ActsAsFerret

  # The RdigAdapter is automatically included into your model if you specify
  # the +:rdig+ options hash in your call to acts_as_ferret. It overrides
  # several methods declared by aaf to retrieve documents with the help of
  # RDig's http crawler when you call rebuild_index.
  module RdigAdapter

    if defined?(RDig)

      # Indexer includes MonitorMixin, so make sure it is loaded.
      require 'monitor'

      # Hook run when the adapter is included into a model class: adds the
      # class level and instance level behaviour defined below.
      def self.included(target)
        target.extend ClassMethods
        target.send :include, InstanceMethods
      end

      # Indexer class to replace RDig's original indexer.
      #
      # Instead of writing to an index it wraps every crawled document in an
      # instance of the model class, queues it, and hands the queue to the
      # given block in batches of +batch_size+ documents.
      class Indexer
        include MonitorMixin

        # batch_size::  number of queued documents that triggers a flush
        # model_class:: class instantiated as <tt>new(uri_string, doc)</tt>
        #               for every crawled document
        # block::       called as <tt>block.call(documents, offset)</tt> for
        #               every batch, where +offset+ is the number of documents
        #               handed over in earlier batches
        def initialize(batch_size, model_class, &block)
          @batch_size = batch_size
          @model_class = model_class
          @documents = []
          @offset = 0
          @block = block
          super() # initializes the monitor provided by MonitorMixin
        end

        # Queues +doc+ (anything responding to +uri+) and flushes the queue
        # once it holds at least +batch_size+ documents. Runs inside the
        # monitor, so it may be called from several threads.
        def add(doc)
          synchronize do
            @documents << @model_class.new(doc.uri.to_s, doc)
            process_batch if @documents.size >= @batch_size
          end
        end
        alias << add

        # Flushes whatever is still queued. The block is called even when the
        # queue is empty.
        def close
          synchronize do
            process_batch
          end
        end

        protected

        # Hands the queued documents to the block, then advances the offset
        # and starts a new, empty queue. Callers must hold the monitor.
        def process_batch
          ActsAsFerret::logger.info "RdigAdapter::Indexer#process_batch: #{@documents.size} docs in queue, offset #{@offset}"
          @block.call @documents, @offset
          @offset += @documents.size
          @documents = []
        end
      end

      module ClassMethods

        # overriding aaf to return the documents fetched via RDig
        #
        # Crawls with RDig and yields the fetched pages to +block+ as
        # <tt>(records, offset)</tt> in batches of +batch_size+.
        #
        # Errors raised while crawling are logged and not re-raised. The
        # indexer is closed in any case, so documents queued before a failure
        # are still handed to the block.
        def records_for_rebuild(batch_size = 1000, &block)
          indexer = Indexer.new(batch_size, self, &block)
          configure_rdig do
            crawler = RDig::Crawler.new RDig.configuration, ActsAsFerret::logger
            # swap the crawler's indexer for the batching one defined above
            crawler.instance_variable_set '@indexer', indexer
            ActsAsFerret::logger.debug "now crawling..."
            crawler.crawl
          end
        rescue => e
          ActsAsFerret::logger.error e
          ActsAsFerret::logger.debug e.backtrace.join("\n")
        ensure
          # indexer is nil when Indexer.new itself failed
          indexer.close if indexer
        end

        # overriding aaf to skip reindexing records changed during the rebuild
        # when rebuilding with the rake task
        def records_modified_since(time)
          []
        end

        # unfortunately need to modify global RDig.configuration because it's
        # used everywhere in RDig
        #
        # Applies the +:crawler+ and +:content_extraction+ settings from
        # +rdig_configuration+ and points RDig's logger at aaf's logger for
        # the duration of the block, then puts the previous crawler config,
        # content extraction config and logger back, also when the block
        # raises. Returns the value of the block.
        def configure_rdig
          # Back up the original objects before entering the begin/ensure, so
          # the ensure clause never restores from a half-taken backup.
          old_logger = RDig.logger
          old_crawler = RDig.configuration.crawler
          old_content_extraction = RDig.configuration.content_extraction
          begin
            RDig.logger = ActsAsFerret.logger

            crawler_options = rdig_configuration[:crawler]
            if crawler_options
              # Apply the overrides to a copy. Mutating the live crawler
              # config would also change the object kept as backup, and the
              # overrides would survive the restore below.
              crawler_cfg = old_crawler.dup
              crawler_options.each { |k, v| crawler_cfg.send :"#{k}=", v }
              RDig.configuration.crawler = crawler_cfg
            end

            ce_config = rdig_configuration[:content_extraction]
            if ce_config
              RDig.configuration.content_extraction = OpenStruct.new( :hpricot => OpenStruct.new( ce_config ) )
            end

            yield
          ensure
            # restore original config
            RDig.configuration.crawler = old_crawler
            RDig.configuration.content_extraction = old_content_extraction
            RDig.logger = old_logger
          end
        end

        # overriding aaf to enforce loading page title and content from the
        # ferret index
        #
        # Forces the +:lazy+ option to true. The caller's +options+ hash is
        # left untouched; a merged copy is what gets passed on.
        def find_with_ferret(q, options = {}, find_options = {})
          options = options.merge(:lazy => true)
          super
        end

        # Builds a page object for the given id (the page url). No RDig
        # document is attached to it.
        def find_for_id(id)
          new id
        end
      end

      module InstanceMethods

        # uri::           url of the page, returned by +id+
        # rdig_document:: the crawled document (responding to +title+ and
        #                 +body+), or nil when the page is built from an id
        #                 only
        def initialize(uri, rdig_document = nil)
          @id = uri
          @rdig_document = rdig_document
        end

        # Title of the document.
        # Use the +:title_tag_selector+ option to declare the hpricot expression
        # that should be used for selecting the content for this field.
        #
        # Returns nil when no RDig document is attached.
        def title
          @rdig_document && @rdig_document.title
        end

        # Content of the document.
        # Use the +:content_tag_selector+ option to declare the hpricot expression
        # that should be used for selecting the content for this field.
        #
        # Returns nil when no RDig document is attached.
        def content
          @rdig_document && @rdig_document.body
        end

        # Url of this document.
        def id
          @id
        end

        def to_s
          "Page at #{id}, title: #{title}"
        end
      end
    end
  end

end