@@ -34,7 +34,9 @@ module Agents |
||
34 | 34 |
|
35 | 35 |
Can be configured to use HTTP basic auth by including the `basic_auth` parameter with `username:password`. |
36 | 36 |
|
37 |
- Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent. |
|
37 |
+ Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent (only used to set the "working" status). |
|
38 |
+ |
|
39 |
+ Set `uniqueness_look_back` (defaults to the larger of 200, 3x the number of received events) to limit the number of events checked for uniqueness (typically for performance). |
|
38 | 40 |
MD |
39 | 41 |
|
40 | 42 |
event_description do |
@@ -43,7 +45,8 @@ module Agents |
||
43 | 45 |
|
44 | 46 |
default_schedule "every_12h" |
45 | 47 |
|
46 |
- UNIQUENESS_LOOK_BACK = 30 |
|
48 |
+ UNIQUENESS_LOOK_BACK = 200 |
|
49 |
+ UNIQUENESS_FACTOR = 3 |
|
47 | 50 |
|
48 | 51 |
def working? |
49 | 52 |
event_created_within?(options['expected_update_period_in_days']) && !recent_error_logs? |
@@ -63,10 +66,32 @@ module Agents |
||
63 | 66 |
end |
64 | 67 |
|
65 | 68 |
def validate_options |
69 |
+ # Check required fields are present |
|
66 | 70 |
errors.add(:base, "url and expected_update_period_in_days are required") unless options['expected_update_period_in_days'].present? && options['url'].present? |
67 | 71 |
if !options['extract'].present? && extraction_type != "json" |
68 | 72 |
errors.add(:base, "extract is required for all types except json") |
69 | 73 |
end |
74 |
+ # Check options: |
|
75 |
+ if options['mode'].present? |
|
76 |
+ if options['mode'] != "on_change" && options['mode'] != "all" |
|
77 |
+ errors.add(:base, "mode should be all or on_change") |
|
78 |
+ end |
|
79 |
+ end |
|
80 |
+ # Check integer variables: |
|
81 |
+ if options['expected_update_period_in_days'].present? |
|
82 |
+ begin |
|
83 |
+ Integer(options['expected_update_period_in_days']) |
|
84 |
+ rescue |
|
85 |
+ errors.add(:base, "Invalid expected_update_period_in_days format") |
|
86 |
+ end |
|
87 |
+ end |
|
88 |
+ if options['uniqueness_look_back'].present? |
|
89 |
+ begin |
|
90 |
+ Integer(options['uniqueness_look_back']) |
|
91 |
+ rescue |
|
92 |
+ errors.add(:base, "Invalid uniqueness_look_back format") |
|
93 |
+ end |
|
94 |
+ end |
|
70 | 95 |
end |
71 | 96 |
|
72 | 97 |
def check |
@@ -84,8 +109,9 @@ module Agents |
||
84 | 109 |
doc = parse(response.body) |
85 | 110 |
|
86 | 111 |
if extract_full_json? |
112 |
+ old_events = previous_payloads 1 |
|
87 | 113 |
result = doc |
88 |
- if store_payload? result |
|
114 |
+ if store_payload? old_events, result |
|
89 | 115 |
log "Storing new result for '#{name}': #{result.inspect}" |
90 | 116 |
create_event :payload => result |
91 | 117 |
end |
@@ -116,6 +142,7 @@ module Agents |
||
116 | 142 |
return |
117 | 143 |
end |
118 | 144 |
|
145 |
+ old_events = previous_payloads num_unique_lengths.first |
|
119 | 146 |
num_unique_lengths.first.times do |index| |
120 | 147 |
result = {} |
121 | 148 |
options['extract'].keys.each do |name| |
@@ -125,7 +152,7 @@ module Agents |
||
125 | 152 |
end |
126 | 153 |
end |
127 | 154 |
|
128 |
- if store_payload? result |
|
155 |
+ if store_payload? old_events, result |
|
129 | 156 |
log "Storing new parsed result for '#{name}': #{result.inspect}" |
130 | 157 |
create_event :payload => result |
131 | 158 |
end |
@@ -138,12 +165,35 @@ module Agents |
||
138 | 165 |
|
139 | 166 |
private |
140 | 167 |
|
141 |
- def store_payload? result |
|
142 |
- !options['mode'] || options['mode'].to_s == "all" || (options['mode'].to_s == "on_change" && !previous_payloads.include?(result.to_json)) |
|
168 |
+ def store_payload?(old_events, result) |
|
169 |
+ if !options['mode'] |
|
170 |
+ return true |
|
171 |
+ elsif options['mode'].to_s == "all" |
|
172 |
+ return true |
|
173 |
+ elsif options['mode'].to_s == "on_change" |
|
174 |
+ old_events.each do |old_event| |
|
175 |
+ if old_event.payload.to_json == result.to_json |
|
176 |
+ old_event.expires_at = new_event_expiration_date |
|
177 |
+ old_event.save |
|
178 |
+ return false |
|
179 |
+ end |
|
180 |
+ end |
|
181 |
+ return true |
|
182 |
+ end |
|
183 |
+ raise "Illegal options[mode]: " + options['mode'].to_s |
|
143 | 184 |
end |
144 | 185 |
|
145 |
- def previous_payloads |
|
146 |
- events.order("id desc").limit(UNIQUENESS_LOOK_BACK).pluck(:payload).map(&:to_json) if options['mode'].to_s == "on_change" |
|
186 |
+ def previous_payloads(num_events) |
|
187 |
+ if options['uniqueness_look_back'].present? |
|
188 |
+ look_back = options['uniqueness_look_back'].to_i |
|
189 |
+ else |
|
190 |
+ # Larger of UNIQUENESS_FACTOR*num_events and UNIQUENESS_LOOK_BACK |
|
191 |
+ look_back = UNIQUENESS_FACTOR*num_events |
|
192 |
+ if look_back < UNIQUENESS_LOOK_BACK |
|
193 |
+ look_back = UNIQUENESS_LOOK_BACK |
|
194 |
+ end |
|
195 |
+ end |
|
196 |
+ events.order("id desc").limit(look_back) if options['mode'].to_s == "on_change" |
|
147 | 197 |
end |
148 | 198 |
|
149 | 199 |
def extract_full_json? |
@@ -15,15 +15,31 @@ describe Agents::WebsiteAgent do |
||
15 | 15 |
'title' => {'css' => "#comic img", 'attr' => "title"} |
16 | 16 |
} |
17 | 17 |
} |
18 |
- @checker = Agents::WebsiteAgent.new(:name => "xkcd", :options => @site) |
|
18 |
+ @checker = Agents::WebsiteAgent.new(:name => "xkcd", :options => @site, :keep_events_for => 2) |
|
19 | 19 |
@checker.user = users(:bob) |
20 | 20 |
@checker.save! |
21 | 21 |
end |
22 | 22 |
|
23 | 23 |
describe "#check" do |
24 |
- it "should check for changes" do |
|
24 |
+ |
|
25 |
+ it "should validate the integer fields" do |
|
26 |
+ @checker.options['expected_update_period_in_days'] = "nonsense" |
|
27 |
+ lambda { @checker.save! }.should raise_error; |
|
28 |
+ @checker.options['expected_update_period_in_days'] = "2" |
|
29 |
+ @checker.options['uniqueness_look_back'] = "nonsense" |
|
30 |
+ lambda { @checker.save! }.should raise_error; |
|
31 |
+ @checker.options['mode'] = "nonsense" |
|
32 |
+ lambda { @checker.save! }.should raise_error; |
|
33 |
+ @checker.options = @site |
|
34 |
+ end |
|
35 |
+ |
|
36 |
+ it "should check for changes (and update Event.expires_at)" do |
|
25 | 37 |
lambda { @checker.check }.should change { Event.count }.by(1) |
38 |
+ event = Event.last |
|
39 |
+ sleep 2 |
|
26 | 40 |
lambda { @checker.check }.should_not change { Event.count } |
41 |
+ update_event = Event.last |
|
42 |
+ update_event.expires_at.should_not == event.expires_at |
|
27 | 43 |
end |
28 | 44 |
|
29 | 45 |
it "should always save events when in :all mode" do |
@@ -35,6 +51,30 @@ describe Agents::WebsiteAgent do |
||
35 | 51 |
}.should change { Event.count }.by(2) |
36 | 52 |
end |
37 | 53 |
|
54 |
+ it "should take uniqueness_look_back into account during deduplication" do |
|
55 |
+ @site['mode'] = 'all' |
|
56 |
+ @checker.options = @site |
|
57 |
+ @checker.check |
|
58 |
+ @checker.check |
|
59 |
+ event = Event.last |
|
60 |
+ event.payload = "{}" |
|
61 |
+ event.save |
|
62 |
+ |
|
63 |
+ lambda { |
|
64 |
+ @site['mode'] = 'on_change' |
|
65 |
+ @site['uniqueness_look_back'] = 2 |
|
66 |
+ @checker.options = @site |
|
67 |
+ @checker.check |
|
68 |
+ }.should_not change { Event.count } |
|
69 |
+ |
|
70 |
+ lambda { |
|
71 |
+ @site['mode'] = 'on_change' |
|
72 |
+ @site['uniqueness_look_back'] = 1 |
|
73 |
+ @checker.options = @site |
|
74 |
+ @checker.check |
|
75 |
+ }.should change { Event.count }.by(1) |
|
76 |
+ end |
|
77 |
+ |
|
38 | 78 |
it "should log an error if the number of results for a set of extraction patterns differs" do |
39 | 79 |
@site['extract']['url']['css'] = "div" |
40 | 80 |
@checker.options = @site |
@@ -79,7 +119,7 @@ describe Agents::WebsiteAgent do |
||
79 | 119 |
'expected_update_period_in_days' => 2, |
80 | 120 |
'type' => "html", |
81 | 121 |
'url' => "http://xkcd.com", |
82 |
- 'mode' => :on_change, |
|
122 |
+ 'mode' => "on_change", |
|
83 | 123 |
'extract' => { |
84 | 124 |
'url' => {'css' => "#topLeft a", 'attr' => "href"}, |
85 | 125 |
'title' => {'css' => "#topLeft a", 'text' => "true"} |
@@ -217,4 +257,4 @@ describe Agents::WebsiteAgent do |
||
217 | 257 |
end |
218 | 258 |
end |
219 | 259 |
|
220 |
-end |
|
260 |
+end |