rdig_adapter.rb 4.2 KB
Edit Raw Blame History



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141


begin
  require 'rdig'
rescue LoadError
end
module ActsAsFerret

  # The RdigAdapter is automatically included into your model if you specify
  # the +:rdig+ options hash in your call to acts_as_ferret. It overrides
  # several methods declared by aaf to retrieve documents with the help of
  # RDig's http crawler when you call rebuild_index.
  module RdigAdapter

    if defined?(RDig)

      # Hook invoked with the class or module this adapter gets included into.
      # Adds the methods of ClassMethods to +target+ itself and mixes
      # InstanceMethods into it.
      def self.included(target)
        # +extend+ hands back its receiver, so both steps can be chained
        target.extend(ClassMethods).send(:include, InstanceMethods)
      end

      # Indexer class to replace RDig's original indexer
      #
      # Queues every document handed over via #add as a +model_class+ instance
      # and passes the queue on to the block given at construction time, one
      # batch at a time. #add and #close do their work inside
      # MonitorMixin#synchronize.
      class Indexer
        include MonitorMixin

        # batch_size::  queue length at which #add flushes the queue
        # model_class:: instantiated per document as
        #               <tt>model_class.new(uri_string, doc)</tt>
        # block::       called with <tt>(documents, offset)</tt> for each batch
        def initialize(batch_size, model_class, &block)
          super() # explicit empty argument list for MonitorMixin's initializer
          @block = block
          @model_class = model_class
          @batch_size = batch_size
          @offset = 0
          @documents = []
        end

        # Wraps +doc+ in a +model_class+ instance, queues it and flushes the
        # queue once it holds at least +batch_size+ entries.
        def add(doc)
          synchronize do
            @documents.push(@model_class.new(doc.uri.to_s, doc))
            process_batch unless @documents.size < @batch_size
          end
        end
        alias_method :<<, :add

        # Flushes whatever is currently queued (the block is called even when
        # the queue is empty).
        def close
          synchronize { process_batch }
        end

        protected

        # Logs the queue state, passes the queued documents and the current
        # offset to the block, advances the offset by the batch size and
        # starts a new queue.
        def process_batch
          ActsAsFerret.logger.info("RdigAdapter::Indexer#process_batch: #{@documents.size} docs in queue, offset #{@offset}")
          @block.call(@documents, @offset)
          @offset += @documents.size
          # a new array rather than #clear: the block may still hold the old one
          @documents = []
        end
      end
      
      module ClassMethods
        # overriding aaf to return the documents fetched via RDig
        #
        # Builds an Indexer around the given block, lets an RDig crawler feed
        # that indexer while the settings from +configure_rdig+ are active, and
        # closes the indexer afterwards in any case.
        #
        # A StandardError raised on the way is logged (the error itself at
        # error level, its backtrace at debug level) and not re-raised.
        #
        # batch_size:: passed on to the Indexer
        # block::      passed on to the Indexer
        def records_for_rebuild(batch_size = 1000, &block)
          begin
            indexer = Indexer.new(batch_size, self, &block)
            configure_rdig do
              spider = RDig::Crawler.new(RDig.configuration, ActsAsFerret.logger)
              # swap the crawler's own indexer for ours
              spider.instance_variable_set('@indexer', indexer)
              ActsAsFerret.logger.debug("now crawling...")
              spider.crawl
            end
          rescue => error
            ActsAsFerret.logger.error(error)
            ActsAsFerret.logger.debug(error.backtrace.join("\n"))
          ensure
            # indexer is still nil if Indexer.new itself failed
            indexer.close if indexer
          end
        end

        # overriding aaf to skip reindexing records changed during the rebuild
        # when rebuilding with the rake task
        #
        # +time+ is ignored; the result is always a new, empty array.
        def records_modified_since(time)
          Array.new
        end

        # unfortunately need to modify global RDig.configuration because it's
        # used everywhere in RDig
        #
        # Runs the given block with RDig's logger set to the aaf logger and
        # with the +:crawler+ and +:content_extraction+ entries of
        # +rdig_configuration+ applied to RDig.configuration. Afterwards - also
        # when the block raises - the previous crawler settings, content
        # extraction settings and logger are put back.
        #
        # Returns the value of the block.
        def configure_rdig
          # Remember the original state. The settings objects themselves are
          # kept (not a shallow copy of the whole configuration), and they are
          # never written to below, so they still hold the original values when
          # they are put back.
          old_logger = RDig.logger
          old_crawler = RDig.configuration.crawler
          old_content_extraction = RDig.configuration.content_extraction
          begin
            RDig.logger = ActsAsFerret.logger
            if crawler_options = rdig_configuration[:crawler]
              # Apply the overrides to a copy. Writing them into the original
              # crawler settings object would also change the backup, since a
              # dup of the configuration shares its nested settings objects.
              crawler = old_crawler.dup
              crawler_options.each { |k, v| crawler.send :"#{k}=", v }
              RDig.configuration.crawler = crawler
            end
            if ce_config = rdig_configuration[:content_extraction]
              RDig.configuration.content_extraction = OpenStruct.new( :hpricot => OpenStruct.new( ce_config ) )
            end
            yield
          ensure
            # restore original config
            RDig.configuration.crawler = old_crawler
            RDig.configuration.content_extraction = old_content_extraction
            RDig.logger = old_logger
          end
        end

        # overriding aaf to enforce loading page title and content from the
        # ferret index
        #
        # Forwards to the overridden implementation with <tt>:lazy => true</tt>
        # added to +options+. The hash passed in by the caller is left as it
        # is (and may be frozen).
        def find_with_ferret(q, options = {}, find_options = {})
          # A bare +super+ passes on the current values of the parameters, so
          # reassigning +options+ is enough to forward the merged hash.
          options = options.merge(:lazy => true)
          super
        end

        # Returns a new instance created from +id+ alone; nothing but the id is
        # passed to +new+.
        def find_for_id(id)
          new(id)
        end
      end

      # Per-document methods. An instance is identified by its url and may
      # carry the RDig document it was built from.
      module InstanceMethods
        # uri::           url of the page; becomes the #id of the instance
        # rdig_document:: object answering +title+ and +body+. Optional:
        #                 +find_for_id+ creates instances from the id alone.
        def initialize(uri, rdig_document = nil)
          @id = uri
          @rdig_document = rdig_document
        end

        # Title of the document, or +nil+ when the instance was created
        # without an RDig document.
        # Use the +:title_tag_selector+ option to declare the hpricot expression
        # that should be used for selecting the content for this field.
        def title
          @rdig_document.title if @rdig_document
        end

        # Content of the document, or +nil+ when the instance was created
        # without an RDig document.
        # Use the +:content_tag_selector+ option to declare the hpricot expression
        # that should be used for selecting the content for this field.
        def content
          @rdig_document.body if @rdig_document
        end

        # Url of this document.
        def id
          @id
        end

        # Short description containing url and title. The title part is empty
        # when no RDig document is attached.
        def to_s
          "Page at #{id}, title: #{title}"
        end
      end
    end
  end
  
end