feed_handler.rb
4.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
require 'feedparser'
require 'open-uri'
# This class is responsible for processing feeds and passing their items to the
# respective container.
#
# The <tt>max_errors</tt> attribute controls how many times it will retry in
# case of failure. If a feed fails <tt>max_errors+1</tt> times, it will be
# disabled and the last error message will be recorded in the container.
# The default value is *6*, if you need to change it you can do that in your
# config/local.rb file like this:
#
# FeedHandler.max_errors = 10
#
# For the update interval, see FeedUpdater.
class FeedHandler
  # Maximum number of consecutive failed updates tolerated for a container.
  # Once a container's +update_errors+ exceeds this value it is disabled.
  cattr_accessor :max_errors
  # How long a disabled container is left alone before #process re-enables
  # it and tries again.
  cattr_accessor :disabled_period
  self.max_errors = 6
  self.disabled_period = 1.week

  # Parses raw feed +content+ and returns the resulting FeedParser::Feed.
  #
  # The content is tagged as UTF-8 on a copy, so the caller's string is left
  # untouched.
  #
  # Raises FeedHandler::ParseError when +content+ is nil or cannot be parsed.
  def parse(content)
    raise FeedHandler::ParseError, "Content is nil" if content.nil?
    begin
      FeedParser::Feed.new(content.dup.force_encoding('utf-8'))
    rescue StandardError
      raise FeedHandler::ParseError, "Invalid feed format."
    end
  end

  # Downloads the feed at +address+ and returns its body as a String.
  #
  # +header+ holds extra open-uri options/HTTP headers; it is not modified.
  # A "User-Agent" entry is always set and overrides any supplied one.
  #
  # In the test environment an +address+ that names an existing local file is
  # read from disk instead of being fetched.
  #
  # Raises FeedHandler::FetchError (carrying the original message) on any
  # failure, including an invalid URL.
  def fetch(address, header = {})
    if Rails.env == 'test' && File.exist?(address)
      File.read(address)
    else
      unless valid_url?(address)
        raise InvalidUrl, "\"%s\" is not a valid URL" % address
      end
      options = header.merge("User-Agent" => "Noosfero/#{Noosfero::VERSION}")
      # Open through the parsed URI rather than Kernel#open: Kernel#open
      # treats a leading "|" as a command to run and no longer opens URLs on
      # Ruby >= 3.0.
      URI.parse(address).open(options) { |io| io.read }
    end
  rescue StandardError => ex
    raise FeedHandler::FetchError, ex.message
  end

  # Same as #fetch, but routes the request through the feed proxy configured
  # in +environment+ (the HTTPS proxy for "https://" addresses, the HTTP proxy
  # otherwise) when one is set.
  #
  # When +environment.disable_feed_ssl+ is set, SSL certificate verification
  # is turned off for the request.
  def fetch_through_proxy(address, environment)
    header = {}
    proxy =
      if address.start_with?("https://")
        environment.https_feed_proxy
      else
        environment.http_feed_proxy
      end
    header[:proxy] = proxy if proxy
    header[:ssl_verify_mode] = OpenSSL::SSL::VERIFY_NONE if environment.disable_feed_ssl
    fetch(address, header)
  end

  # Updates +container+ from its feed inside a transaction.
  #
  # A container that failed too many times is re-enabled once
  # +disabled_period+ has elapsed since it was disabled. Disabled containers
  # are skipped. On success the error counter is reset; on failure the error
  # is logged and recorded in the container, which is disabled once the
  # counter exceeds +max_errors+.
  #
  # Only StandardError is handled, so interrupts and exit requests still
  # propagate to the caller.
  def process(container)
    container.class.transaction do
      if failed_too_many_times(container) && enough_time_since_last_failure(container)
        container.enabled = true
        container.update_errors = 0
        container.save
      end
      next unless container.enabled
      actually_process_container(container)
      container.update_errors = 0
      container.finish_fetch
    end
  rescue StandardError => exception
    register_failure(container, exception)
  end

  # These derive from StandardError so that a plain +rescue+ catches them.
  class InvalidUrl < StandardError; end
  class ParseError < StandardError; end
  class FetchError < StandardError; end

  protected

  # Clears +container+, fetches its feed (through the proxy when the
  # environment enables it), parses it and adds the items to the container.
  def actually_process_container(container)
    container.clear
    content =
      if container.environment.enable_feed_proxy
        fetch_through_proxy(container.address, container.environment)
      else
        fetch(container.address)
      end
    container.fetched_at = Time.now
    parsed_feed = parse(content)
    container.feed_title = parsed_feed.title
    # NOTE(review): a limit of 0 produces the range 0..-1, which selects every
    # item rather than none -- confirm whether callers rely on that before
    # changing this slice.
    parsed_feed.items[0..container.limit - 1].reverse.each do |item|
      container.add_item(item.title, item.link, item.date, item.content)
    end
  end

  # Logs +exception+, records it in +container+ and disables the container
  # once it has failed more than +max_errors+ times.
  def register_failure(container, exception)
    Rails.logger.warn("Unknown error from %s ID %d\n%s" % [container.class.name, container.id, exception.to_s])
    Rails.logger.warn("Backtrace:\n%s" % Array(exception.backtrace).join("\n"))
    # Discard whatever the failed attempt left in memory before updating the
    # error bookkeeping.
    container.reload
    container.update_errors += 1
    container.error_message = exception.to_s
    if failed_too_many_times(container)
      # fetched_at doubles as the "disabled since" timestamp checked by
      # enough_time_since_last_failure.
      container.fetched_at = Time.now
      container.enabled = false
    end
    begin
      container.finish_fetch
    rescue StandardError => finish_fetch_exception
      Rails.logger.warn("Unable to finish fetch from %s ID %d\n%s" % [container.class.name, container.id, finish_fetch_exception.to_s])
      Rails.logger.warn("Backtrace:\n%s" % Array(finish_fetch_exception.backtrace).join("\n"))
    end
  end

  # True when +url+ as a whole parses as an http or https URL with a host.
  def valid_url?(url)
    return false unless url.respond_to?(:to_str)
    uri = URI.parse(url)
    uri.is_a?(URI::HTTP) && !uri.host.to_s.empty?
  rescue URI::Error
    false
  end

  # True when +container+ has accumulated more than +max_errors+ failures.
  def failed_too_many_times(container)
    container.update_errors > FeedHandler.max_errors
  end

  # True when +container+ was never fetched, or was last fetched more than
  # +disabled_period+ ago.
  def enough_time_since_last_failure(container)
    container.fetched_at.nil? || container.fetched_at < (Time.now - FeedHandler.disabled_period)
  end
end