@@ -34,7 +34,9 @@ module Agents |
||
| 34 | 34 |
|
| 35 | 35 |
Can be configured to use HTTP basic auth by including the `basic_auth` parameter with `username:password`. |
| 36 | 36 |
|
| 37 |
- Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent. |
|
| 37 |
+ Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent (only used to set the "working" status). |
|
| 38 |
+ |
|
| 39 |
+ Set `uniqueness_look_back` (defaults to 10000) to limit the number of events checked for uniqueness (typically for performance). |
|
| 38 | 40 |
MD |
| 39 | 41 |
|
| 40 | 42 |
event_description do |
@@ -43,7 +45,7 @@ module Agents |
||
| 43 | 45 |
|
| 44 | 46 |
default_schedule "every_12h" |
| 45 | 47 |
|
| 46 |
- UNIQUENESS_LOOK_BACK = 30 |
|
| 48 |
+ UNIQUENESS_LOOK_BACK = 10000 |
|
| 47 | 49 |
|
| 48 | 50 |
def working? |
| 49 | 51 |
event_created_within?(options['expected_update_period_in_days']) && !recent_error_logs? |
@@ -82,10 +84,11 @@ module Agents |
||
| 82 | 84 |
end |
| 83 | 85 |
request.on_success do |response| |
| 84 | 86 |
doc = parse(response.body) |
| 87 |
+ old_events = previous_payloads |
|
| 85 | 88 |
|
| 86 | 89 |
if extract_full_json? |
| 87 | 90 |
result = doc |
| 88 |
- if store_payload? result |
|
| 91 |
+ if store_payload? old_events, result |
|
| 89 | 92 |
log "Storing new result for '#{name}': #{result.inspect}"
|
| 90 | 93 |
create_event :payload => result |
| 91 | 94 |
end |
@@ -125,7 +128,7 @@ module Agents |
||
| 125 | 128 |
end |
| 126 | 129 |
end |
| 127 | 130 |
|
| 128 |
- if store_payload? result |
|
| 131 |
+ if store_payload? old_events, result |
|
| 129 | 132 |
log "Storing new parsed result for '#{name}': #{result.inspect}"
|
| 130 | 133 |
create_event :payload => result |
| 131 | 134 |
end |
@@ -138,12 +141,27 @@ module Agents |
||
| 138 | 141 |
|
| 139 | 142 |
private |
| 140 | 143 |
|
| 141 |
- def store_payload? result |
|
| 142 |
- !options['mode'] || options['mode'].to_s == "all" || (options['mode'].to_s == "on_change" && !previous_payloads.include?(result.to_json)) |
|
| 144 |
+ def store_payload?(old_events, result) |
|
| 145 |
+ if !options['mode'] |
|
| 146 |
+ return true |
|
| 147 |
+ elsif options['mode'].to_s == "all" |
|
| 148 |
+ return true |
|
| 149 |
+ elsif options['mode'].to_s == "on_change" |
|
| 150 |
+ old_events.each do |old_event| |
|
| 151 |
+ if old_event.payload.to_json == result.to_json |
|
| 152 |
+ old_event.expires_at = new_event_expiration_date |
|
| 153 |
+ old_event.save |
|
| 154 |
+ return false |
|
| 155 |
+ end |
|
| 156 |
+ end |
|
| 157 |
+ return true |
|
| 158 |
+ end |
|
| 159 |
+ raise "Illegal options[mode]: " + options['mode'].to_s |
|
| 143 | 160 |
end |
| 144 | 161 |
|
| 145 | 162 |
def previous_payloads |
| 146 |
- events.order("id desc").limit(UNIQUENESS_LOOK_BACK).pluck(:payload).map(&:to_json) if options['mode'].to_s == "on_change"
|
|
| 163 |
+ look_back = options['uniqueness_look_back'] ? options['uniqueness_look_back'].to_i : UNIQUENESS_LOOK_BACK |
|
| 164 |
+ events.order("id desc").limit(look_back) if options['mode'].to_s == "on_change"
|
|
| 147 | 165 |
end |
| 148 | 166 |
|
| 149 | 167 |
def extract_full_json? |
@@ -15,15 +15,19 @@ describe Agents::WebsiteAgent do |
||
| 15 | 15 |
'title' => {'css' => "#comic img", 'attr' => "title"}
|
| 16 | 16 |
} |
| 17 | 17 |
} |
| 18 |
- @checker = Agents::WebsiteAgent.new(:name => "xkcd", :options => @site) |
|
| 18 |
+ @checker = Agents::WebsiteAgent.new(:name => "xkcd", :options => @site, :keep_events_for => 2) |
|
| 19 | 19 |
@checker.user = users(:bob) |
| 20 | 20 |
@checker.save! |
| 21 | 21 |
end |
| 22 | 22 |
|
| 23 | 23 |
describe "#check" do |
| 24 |
- it "should check for changes" do |
|
| 24 |
+ it "should check for changes (and update Event.expires_at)" do |
|
| 25 | 25 |
lambda { @checker.check }.should change { Event.count }.by(1)
|
| 26 |
+ event = Event.last |
|
| 27 |
+ sleep 2 |
|
| 26 | 28 |
lambda { @checker.check }.should_not change { Event.count }
|
| 29 |
+ update_event = Event.last |
|
| 30 |
+ update_event.expires_at.should_not == event.expires_at |
|
| 27 | 31 |
end |
| 28 | 32 |
|
| 29 | 33 |
it "should always save events when in :all mode" do |
@@ -35,6 +39,30 @@ describe Agents::WebsiteAgent do |
||
| 35 | 39 |
}.should change { Event.count }.by(2)
|
| 36 | 40 |
end |
| 37 | 41 |
|
| 42 |
+ it "should take uniqueness_look_back into account during deduplication" do |
|
| 43 |
+ @site['mode'] = 'all' |
|
| 44 |
+ @checker.options = @site |
|
| 45 |
+ @checker.check |
|
| 46 |
+ @checker.check |
|
| 47 |
+ event = Event.last |
|
| 48 |
+ event.payload = "{}"
|
|
| 49 |
+ event.save |
|
| 50 |
+ |
|
| 51 |
+ lambda {
|
|
| 52 |
+ @site['mode'] = 'on_change' |
|
| 53 |
+ @site['uniqueness_look_back'] = 2 |
|
| 54 |
+ @checker.options = @site |
|
| 55 |
+ @checker.check |
|
| 56 |
+ }.should_not change { Event.count }
|
|
| 57 |
+ |
|
| 58 |
+ lambda {
|
|
| 59 |
+ @site['mode'] = 'on_change' |
|
| 60 |
+ @site['uniqueness_look_back'] = 1 |
|
| 61 |
+ @checker.options = @site |
|
| 62 |
+ @checker.check |
|
| 63 |
+ }.should change { Event.count }.by(1)
|
|
| 64 |
+ end |
|
| 65 |
+ |
|
| 38 | 66 |
it "should log an error if the number of results for a set of extraction patterns differs" do |
| 39 | 67 |
@site['extract']['url']['css'] = "div" |
| 40 | 68 |
@checker.options = @site |
@@ -217,4 +245,4 @@ describe Agents::WebsiteAgent do |
||
| 217 | 245 |
end |
| 218 | 246 |
end |
| 219 | 247 |
|
| 220 |
-end |
|
| 248 |
+end |