|
|
@@ -6,6 +6,11 @@ module Agents
|
6
|
6
|
class WebsiteAgent < Agent
|
7
|
7
|
cannot_receive_events!
|
8
|
8
|
|
|
9
|
+ default_schedule "every_12h"
|
|
10
|
+
|
|
11
|
+ UNIQUENESS_LOOK_BACK = 200
|
|
12
|
+ UNIQUENESS_FACTOR = 3
|
|
13
|
+
|
9
|
14
|
description <<-MD
|
10
|
15
|
The WebsiteAgent scrapes a website, XML document, or JSON feed and creates Events based on the results.
|
11
|
16
|
|
|
|
@@ -34,20 +39,15 @@ module Agents
|
34
|
39
|
|
35
|
40
|
Can be configured to use HTTP basic auth by including the `basic_auth` parameter with `username:password`.
|
36
|
41
|
|
37
|
|
- Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent (only used to set the "working" status).
|
|
42
|
+ Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent. This is only used to set the "working" status.
|
38
|
43
|
|
39
|
|
- Set `uniqueness_look_back` (defaults to the larger of 200, 3x the number of received events) to limit the number of events checked for uniqueness (typically for performance).
|
|
44
|
+ Set `uniqueness_look_back` to limit the number of events checked for uniqueness (typically for performance). This defaults to the larger of #{UNIQUENESS_LOOK_BACK} or #{UNIQUENESS_FACTOR}x the number of detected received results.
|
40
|
45
|
MD
|
41
|
46
|
|
42
|
47
|
event_description do
|
43
|
48
|
"Events will have the fields you specified. Your options look like:\n\n #{Utils.pretty_print options['extract']}"
|
44
|
49
|
end
|
45
|
50
|
|
46
|
|
- default_schedule "every_12h"
|
47
|
|
-
|
48
|
|
- UNIQUENESS_LOOK_BACK = 200
|
49
|
|
- UNIQUENESS_FACTOR = 3
|
50
|
|
-
|
51
|
51
|
def working?
|
52
|
52
|
event_created_within?(options['expected_update_period_in_days']) && !recent_error_logs?
|
53
|
53
|
end
|
|
|
@@ -66,54 +66,44 @@ module Agents
|
66
|
66
|
end
|
67
|
67
|
|
68
|
68
|
def validate_options
|
69
|
|
- # Check required fields are present
|
|
69
|
+ # Check for required fields
|
70
|
70
|
errors.add(:base, "url and expected_update_period_in_days are required") unless options['expected_update_period_in_days'].present? && options['url'].present?
|
71
|
71
|
if !options['extract'].present? && extraction_type != "json"
|
72
|
72
|
errors.add(:base, "extract is required for all types except json")
|
73
|
73
|
end
|
74
|
|
- # Check options:
|
|
74
|
+
|
|
75
|
+ # Check for optional fields
|
75
|
76
|
if options['mode'].present?
|
76
|
|
- if options['mode'] != "on_change" && options['mode'] != "all"
|
77
|
|
- errors.add(:base, "mode should be all or on_change")
|
78
|
|
- end
|
|
77
|
+ errors.add(:base, "mode must be set to on_change or all") unless %w[on_change all].include?(options['mode'])
|
79
|
78
|
end
|
80
|
|
- # Check integer variables:
|
|
79
|
+
|
81
|
80
|
if options['expected_update_period_in_days'].present?
|
82
|
|
- begin
|
83
|
|
- Integer(options['expected_update_period_in_days'])
|
84
|
|
- rescue
|
85
|
|
- errors.add(:base, "Invalid expected_update_period_in_days format")
|
86
|
|
- end
|
|
81
|
+ errors.add(:base, "Invalid expected_update_period_in_days format") unless is_positive_integer?(options['expected_update_period_in_days'])
|
87
|
82
|
end
|
|
83
|
+
|
88
|
84
|
if options['uniqueness_look_back'].present?
|
89
|
|
- begin
|
90
|
|
- Integer(options['uniqueness_look_back'])
|
91
|
|
- rescue
|
92
|
|
- errors.add(:base, "Invalid uniqueness_look_back format")
|
93
|
|
- end
|
|
85
|
+ errors.add(:base, "Invalid uniqueness_look_back format") unless is_positive_integer?(options['uniqueness_look_back'])
|
94
|
86
|
end
|
95
|
87
|
end
|
96
|
88
|
|
97
|
89
|
def check
|
98
|
90
|
hydra = Typhoeus::Hydra.new
|
99
|
91
|
log "Fetching #{options['url']}"
|
100
|
|
- request_opts = {:followlocation => true}
|
101
|
|
- if options['basic_auth'].present?
|
102
|
|
- request_opts[:userpwd] = options['basic_auth']
|
103
|
|
- end
|
|
92
|
+ request_opts = { :followlocation => true }
|
|
93
|
+ request_opts[:userpwd] = options['basic_auth'] if options['basic_auth'].present?
|
104
|
94
|
request = Typhoeus::Request.new(options['url'], request_opts)
|
|
95
|
+
|
105
|
96
|
request.on_failure do |response|
|
106
|
97
|
error "Failed: #{response.inspect}"
|
107
|
98
|
end
|
|
99
|
+
|
108
|
100
|
request.on_success do |response|
|
109
|
101
|
doc = parse(response.body)
|
110
|
102
|
|
111
|
103
|
if extract_full_json?
|
112
|
|
- old_events = previous_payloads 1
|
113
|
|
- result = doc
|
114
|
|
- if store_payload? old_events, result
|
115
|
|
- log "Storing new result for '#{name}': #{result.inspect}"
|
116
|
|
- create_event :payload => result
|
|
104
|
+ if store_payload!(previous_payloads(1), doc)
|
|
105
|
+ log "Storing new result for '#{name}': #{doc.inspect}"
|
|
106
|
+ create_event :payload => doc
|
117
|
107
|
end
|
118
|
108
|
else
|
119
|
109
|
output = {}
|
|
|
@@ -152,7 +142,7 @@ module Agents
|
152
|
142
|
end
|
153
|
143
|
end
|
154
|
144
|
|
155
|
|
- if store_payload? old_events, result
|
|
145
|
+ if store_payload!(old_events, result)
|
156
|
146
|
log "Storing new parsed result for '#{name}': #{result.inspect}"
|
157
|
147
|
create_event :payload => result
|
158
|
148
|
end
|
|
|
@@ -165,16 +155,20 @@ module Agents
|
165
|
155
|
|
166
|
156
|
private
|
167
|
157
|
|
168
|
|
- def store_payload?(old_events, result)
|
169
|
|
- if !options['mode']
|
|
158
|
+ # This method returns true if the result should be stored as a new event.
|
|
159
|
+ # If mode is set to 'on_change', this method may return false and update an existing
|
|
160
|
+ # event to expire further in the future.
|
|
161
|
+ def store_payload!(old_events, result)
|
|
162
|
+ if !options['mode'].present?
|
170
|
163
|
return true
|
171
|
164
|
elsif options['mode'].to_s == "all"
|
172
|
165
|
return true
|
173
|
166
|
elsif options['mode'].to_s == "on_change"
|
|
167
|
+ result_json = result.to_json
|
174
|
168
|
old_events.each do |old_event|
|
175
|
|
- if old_event.payload.to_json == result.to_json
|
|
169
|
+ if old_event.payload.to_json == result_json
|
176
|
170
|
old_event.expires_at = new_event_expiration_date
|
177
|
|
- old_event.save
|
|
171
|
+ old_event.save!
|
178
|
172
|
return false
|
179
|
173
|
end
|
180
|
174
|
end
|
|
|
@@ -187,8 +181,8 @@ module Agents
|
187
|
181
|
if options['uniqueness_look_back'].present?
|
188
|
182
|
look_back = options['uniqueness_look_back'].to_i
|
189
|
183
|
else
|
190
|
|
- # Larger of UNIQUENESS_FACTOR*num_events and UNIQUENESS_LOOK_BACK
|
191
|
|
- look_back = UNIQUENESS_FACTOR*num_events
|
|
184
|
+ # Larger of UNIQUENESS_FACTOR * num_events and UNIQUENESS_LOOK_BACK
|
|
185
|
+ look_back = UNIQUENESS_FACTOR * num_events
|
192
|
186
|
if look_back < UNIQUENESS_LOOK_BACK
|
193
|
187
|
look_back = UNIQUENESS_LOOK_BACK
|
194
|
188
|
end
|
|
|
@@ -197,7 +191,7 @@ module Agents
|
197
|
191
|
end
|
198
|
192
|
|
199
|
193
|
def extract_full_json?
|
200
|
|
- (!options['extract'].present? && extraction_type == "json")
|
|
194
|
+ !options['extract'].present? && extraction_type == "json"
|
201
|
195
|
end
|
202
|
196
|
|
203
|
197
|
def extraction_type
|
|
|
@@ -224,5 +218,13 @@ module Agents
|
224
|
218
|
raise "Unknown extraction type #{extraction_type}"
|
225
|
219
|
end
|
226
|
220
|
end
|
|
221
|
+
|
|
222
|
+ def is_positive_integer?(value)
|
|
223
|
+ begin
|
|
224
|
+ Integer(value) >= 0
|
|
225
|
+ rescue
|
|
226
|
+ false
|
|
227
|
+ end
|
|
228
|
+ end
|
227
|
229
|
end
|
228
|
230
|
end
|