@@ -20,7 +20,12 @@ module Agents
 
       `url` can be a single url, or an array of urls (for example, for multiple pages with the exact same structure but different content to scrape)
 
-      The WebsiteAgent can also scrape based on incoming events. It will scrape the url contained in the `url` key of the incoming event payload, or if you set `url_from_event` it is used as a Liquid template to generate the url to access. If you specify `merge` as the `mode`, it will retain the old payload and update it with the new values.
+      The WebsiteAgent can also scrape based on incoming events.
+
+      * If the Event contains a `url` key, that URL will be fetched.
+      * For more control, you can set the `url_from_event` option and it will be used as a Liquid template to generate the url to access based on the Event.
+      * If you set `event_data_path` to the [JSONPath](http://goessner.net/articles/JsonPath/) of content in the Event, that will be used directly without fetching any URL.
+      * If you specify `merge` for the `mode` option, Huginn will retain the old payload and update it with the new values.
 
       # Supported Document Types
 
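To make the new option concrete, here is a minimal sketch of agent options exercising `event_data_path`. The option names (`type`, `event_data_path`, `mode`, `extract`, `expected_update_period_in_days`) all appear in the diff and the specs below; the values themselves are hypothetical:

```json
{
  "expected_update_period_in_days": "2",
  "type": "json",
  "mode": "merge",
  "event_data_path": "some_object.some_data",
  "extract": {
    "value": { "path": "hello" }
  }
}
```

With options like these, an incoming Event carrying a JSON string at `some_object.some_data` is parsed directly and no HTTP request is made.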
@@ -140,7 +145,7 @@ module Agents
 
     def validate_options
       # Check for required fields
-      errors.add(:base, "either url or url_from_event is required") unless options['url'].present? || options['url_from_event'].present?
+      errors.add(:base, "either url, url_from_event, or event_data_path is required") unless options['url'].present? || options['url_from_event'].present? || options['event_data_path'].present?
       errors.add(:base, "expected_update_period_in_days is required") unless options['expected_update_period_in_days'].present?
       validate_extract_options!
 
@@ -251,15 +256,15 @@ module Agents
       check_urls(interpolated['url'])
     end
 
-    def check_urls(in_url, payload = {})
+    def check_urls(in_url, existing_payload = {})
       return unless in_url.present?
 
       Array(in_url).each do |url|
-        check_url(url, payload)
+        check_url(url, existing_payload)
       end
     end
 
-    def check_url(url, payload = {})
+    def check_url(url, existing_payload = {})
       unless /\Ahttps?:\/\//i === url
         error "Ignoring a non-HTTP url: #{url.inspect}"
         return
@@ -271,70 +276,89 @@ module Agents
 
       interpolation_context.stack {
         interpolation_context['_response_'] = ResponseDrop.new(response)
-        body = response.body
-        doc = parse(body)
+        handle_data(response.body, response.env[:url], existing_payload)
+      }
+    rescue => e
+      error "Error when fetching url: #{e.message}\n#{e.backtrace.join("\n")}"
+    end
 
-        if extract_full_json?
-          if store_payload!(previous_payloads(1), doc)
-            log "Storing new result for '#{name}': #{doc.inspect}"
-            create_event payload: payload.merge(doc)
-          end
-          return
+    def handle_data(body, url, existing_payload)
+      doc = parse(body)
+
+      if extract_full_json?
+        if store_payload!(previous_payloads(1), doc)
+          log "Storing new result for '#{name}': #{doc.inspect}"
+          create_event payload: existing_payload.merge(doc)
         end
+        return
+      end
 
-        output =
-          case extraction_type
+      output =
+        case extraction_type
         when 'json'
           extract_json(doc)
         when 'text'
          extract_text(doc)
         else
          extract_xml(doc)
-          end
+        end
 
-        num_unique_lengths = interpolated['extract'].keys.map { |name| output[name].length }.uniq
+      num_unique_lengths = interpolated['extract'].keys.map { |name| output[name].length }.uniq
 
-        if num_unique_lengths.length != 1
-          raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
-        end
+      if num_unique_lengths.length != 1
+        raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
+      end
 
-        old_events = previous_payloads num_unique_lengths.first
-        num_unique_lengths.first.times do |index|
-          result = {}
-          interpolated['extract'].keys.each do |name|
-            result[name] = output[name][index]
-            if name.to_s == 'url'
-              result[name] = (response.env[:url] + Utils.normalize_uri(result[name])).to_s
-            end
+      old_events = previous_payloads num_unique_lengths.first
+      num_unique_lengths.first.times do |index|
+        result = {}
+        interpolated['extract'].keys.each do |name|
+          result[name] = output[name][index]
+          if name.to_s == 'url' && url.present?
+            result[name] = (url + Utils.normalize_uri(result[name])).to_s
           end
+        end
 
-          if store_payload!(old_events, result)
-            log "Storing new parsed result for '#{name}': #{result.inspect}"
-            create_event payload: payload.merge(result)
-          end
+        if store_payload!(old_events, result)
+          log "Storing new parsed result for '#{name}': #{result.inspect}"
+          create_event payload: existing_payload.merge(result)
         end
-      }
-    rescue => e
-      error "Error when fetching url: #{e.message}\n#{e.backtrace.join("\n")}"
+      end
     end
 
     def receive(incoming_events)
       incoming_events.each do |event|
         interpolate_with(event) do
-          url_to_scrape =
-            if url_template = options['url_from_event'].presence
-              interpolate_options(url_template)
+          existing_payload = interpolated['mode'].to_s == "merge" ? event.payload : {}
+
+          if event_data_path = options['event_data_path'].presence
+            data = Utils.value_at(event.payload, interpolate_options(event_data_path))
+            if data.present?
+              handle_event_data(data, event, existing_payload)
             else
-              event.payload['url']
+              error "No data was found in the Event payload at the JSONPath #{interpolate_options(event_data_path)}", inbound_event: event
             end
-          check_urls(url_to_scrape,
-                     interpolated['mode'].to_s == "merge" ? event.payload : {})
+          else
+            url_to_scrape =
+              if url_template = options['url_from_event'].presence
+                interpolate_options(url_template)
+              else
+                event.payload['url']
+              end
+            check_urls(url_to_scrape, existing_payload)
+          end
         end
       end
     end
 
     private
 
+    def handle_event_data(data, event, existing_payload)
+      handle_data(data, event.payload['url'], existing_payload)
+    rescue => e
+      error "Error when handling event data: #{e.message}\n#{e.backtrace.join("\n")}", inbound_event: event
+    end
+
     # This method returns true if the result should be stored as a new event.
     # If mode is set to 'on_change', this method may return false and update an existing
     # event to expire further in the future.
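As a sketch of the `receive` dispatch above (payload values here are hypothetical, modeled on the specs below): with `event_data_path` set to `some_object.some_data`, the agent calls `handle_data` directly on the string found at that path and fetches nothing, whereas without it the Event's `url` key (or `url_from_event`) is fetched as before.

```json
{
  "url": "http://example.com/page",
  "some_object": {
    "some_data": "{\"hello\": \"world\"}"
  }
}
```

In `merge` mode the emitted Event would carry this whole payload plus the extracted values.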
@@ -763,92 +763,186 @@ fire: hot
     end
 
   describe "#receive" do
-    before do
-      @event = Event.new
-      @event.agent = agents(:bob_rain_notifier_agent)
-      @event.payload = {
-        'url' => 'http://xkcd.com',
-        'link' => 'Random',
-      }
-    end
+    describe "with a url or url_from_event" do
+      before do
+        @event = Event.new
+        @event.agent = agents(:bob_rain_notifier_agent)
+        @event.payload = {
+          'url' => 'http://xkcd.com',
+          'link' => 'Random',
+        }
+      end
 
-    it "should scrape from the url element in incoming event payload" do
-      expect {
-        @checker.options = @valid_options
+      it "should scrape from the url element in incoming event payload" do
+        expect {
+          @checker.options = @valid_options
+          @checker.receive([@event])
+        }.to change { Event.count }.by(1)
+      end
+
+      it "should use url_from_event as url to scrape if it exists when receiving an event" do
+        stub = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
+
+        @checker.options = @valid_options.merge(
+          'url_from_event' => 'http://example.org/?url={{url | uri_escape}}'
+        )
         @checker.receive([@event])
-      }.to change { Event.count }.by(1)
-    end
 
-    it "should use url_from_event as url to scrape if it exists when receiving an event" do
-      stub = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
+        expect(stub).to have_been_requested
+      end
 
-      @checker.options = @valid_options.merge(
-        'url_from_event' => 'http://example.org/?url={{url | uri_escape}}'
-      )
-      @checker.receive([@event])
+      it "should allow url_from_event to be an array of urls" do
+        stub1 = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
+        stub2 = stub_request(:any, 'http://google.org/?url=http%3A%2F%2Fxkcd.com')
 
-      expect(stub).to have_been_requested
-    end
+        @checker.options = @valid_options.merge(
+          'url_from_event' => ['http://example.org/?url={{url | uri_escape}}', 'http://google.org/?url={{url | uri_escape}}']
+        )
+        @checker.receive([@event])
 
-    it "should allow url_from_event to be an array of urls" do
-      stub1 = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
-      stub2 = stub_request(:any, 'http://google.org/?url=http%3A%2F%2Fxkcd.com')
+        expect(stub1).to have_been_requested
+        expect(stub2).to have_been_requested
+      end
 
-      @checker.options = @valid_options.merge(
-        'url_from_event' => ['http://example.org/?url={{url | uri_escape}}', 'http://google.org/?url={{url | uri_escape}}']
-      )
-      @checker.receive([@event])
+      it "should interpolate values from incoming event payload" do
+        expect {
+          @valid_options['extract'] = {
+            'from' => {
+              'xpath' => '*[1]',
+              'value' => '{{url | to_xpath}}'
+            },
+            'to' => {
+              'xpath' => '(//a[@href and text()={{link | to_xpath}}])[1]',
+              'value' => '@href'
+            },
+          }
+          @checker.options = @valid_options
+          @checker.receive([@event])
+        }.to change { Event.count }.by(1)
 
-      expect(stub1).to have_been_requested
-      expect(stub2).to have_been_requested
-    end
+        expect(Event.last.payload).to eq({
+          'from' => 'http://xkcd.com',
+          'to' => 'http://dynamic.xkcd.com/random/comic/',
+        })
+      end
 
-    it "should interpolate values from incoming event payload" do
-      expect {
-        @valid_options['extract'] = {
-          'from' => {
-            'xpath' => '*[1]',
-            'value' => '{{url | to_xpath}}'
-          },
-          'to' => {
-            'xpath' => '(//a[@href and text()={{link | to_xpath}}])[1]',
-            'value' => '@href'
-          },
-        }
-        @checker.options = @valid_options
-        @checker.receive([@event])
-      }.to change { Event.count }.by(1)
+      it "should interpolate values from incoming event payload and _response_" do
+        @event.payload['title'] = 'XKCD'
 
-      expect(Event.last.payload).to eq({
-        'from' => 'http://xkcd.com',
-        'to' => 'http://dynamic.xkcd.com/random/comic/',
-      })
-    end
+        expect {
+          @valid_options['extract'] = {
+            'response_info' => @valid_options['extract']['url'].merge(
+              'value' => '{% capture sentence %}The response from {{title}} was {{_response_.status}} {{_response_.headers.X-Status-Message}}.{% endcapture %}{{sentence | to_xpath}}'
+            )
+          }
+          @checker.options = @valid_options
+          @checker.receive([@event])
+        }.to change { Event.count }.by(1)
 
-    it "should interpolate values from incoming event payload and _response_" do
-      @event.payload['title'] = 'XKCD'
+        expect(Event.last.payload['response_info']).to eq('The response from XKCD was 200 OK.')
+      end
 
-      expect {
-        @valid_options['extract'] = {
-          'response_info' => @valid_options['extract']['url'].merge(
-            'value' => '{% capture sentence %}The reponse from {{title}} was {{_response_.status}} {{_response_.headers.X-Status-Message}}.{% endcapture %}{{sentence | to_xpath}}'
+      it "should support merging of events" do
+        expect {
+          @checker.options = @valid_options
+          @checker.options[:mode] = "merge"
+          @checker.receive([@event])
+        }.to change { Event.count }.by(1)
+        last_payload = Event.last.payload
+        expect(last_payload['link']).to eq('Random')
+      end
+    end
+
+    describe "with an event_data_path" do
+      describe "with JSON data" do
+        before do
+          @event = Event.new
+          @event.agent = agents(:bob_rain_notifier_agent)
+          @event.payload = {
+            'something' => 'some value',
+            'some_object' => {
+              'some_data' => { hello: 'world' }.to_json
+            }
+          }
+          @event.save!
+
+          @checker.options = @valid_options.merge(
+            'type' => 'json',
+            'event_data_path' => 'some_object.some_data',
+            'extract' => {
+              'value' => { 'path' => 'hello' }
+            }
+          )
+        end
+
+        it "should extract from the event data in the incoming event payload" do
+          expect {
+            @checker.receive([@event])
+          }.to change { Event.count }.by(1)
+          expect(@checker.events.last.payload).to eq({ 'value' => 'world' })
+        end
+
+        it "should support merge mode" do
+          @checker.options['mode'] = "merge"
+
+          expect {
+            @checker.receive([@event])
+          }.to change { Event.count }.by(1)
+          expect(@checker.events.last.payload).to eq(@event.payload.merge('value' => 'world'))
+        end
+
+        it "should output an error when nothing can be found at the path" do
+          @checker.options = @checker.options.merge(
+            'event_data_path' => 'some_object.mistake'
           )
-        }
-        @checker.options = @valid_options
-        @checker.receive([@event])
-      }.to change { Event.count }.by(1)
 
-      expect(Event.last.payload['response_info']).to eq('The reponse from XKCD was 200 OK.')
-    end
+          expect {
+            @checker.receive([@event])
+          }.to_not change { Event.count }
 
-    it "should support merging of events" do
-      expect {
-        @checker.options = @valid_options
-        @checker.options[:mode] = "merge"
-        @checker.receive([@event])
-      }.to change { Event.count }.by(1)
-      last_payload = Event.last.payload
-      expect(last_payload['link']).to eq('Random')
+          expect(@checker.logs.last.message).to match(/No data was found in the Event payload at the JSONPath some_object.mistake/)
+        end
+
+        it "should output an error when the data cannot be parsed" do
+          @event.update_attribute :payload, @event.payload.merge('some_object' => { 'some_data' => '{invalid json' })
+
+          expect {
+            @checker.receive([@event])
+          }.to_not change { Event.count }
+
+          expect(@checker.logs.last.message).to match(/Error when handling event data:/)
+        end
+      end
+
+      describe "with HTML data" do
+        before do
+          @event = Event.new
+          @event.agent = agents(:bob_rain_notifier_agent)
+          @event.payload = {
+            'url' => 'http://xkcd.com',
+            'some_object' => {
+              'some_data' => "<div><span class='title'>Title!</span><span class='body'>Body!</span></div>"
+            }
+          }
+          @event.save!
+
+          @checker.options = @valid_options.merge(
+            'type' => 'html',
+            'event_data_path' => 'some_object.some_data',
+            'extract' => {
+              'title' => { 'css' => ".title", 'value' => ".//text()" },
+              'body' => { 'css' => "div span.body", 'value' => ".//text()" }
+            }
+          )
+        end
+
+        it "should extract from the event data in the incoming event payload" do
+          expect {
+            @checker.receive([@event])
+          }.to change { Event.count }.by(1)
+          expect(@checker.events.last.payload).to eq({ 'title' => 'Title!', 'body' => 'Body!' })
+        end
+      end
     end
   end
 end