@@ -20,7 +20,12 @@ module Agents

       `url` can be a single url, or an array of urls (for example, for multiple pages with the exact same structure but different content to scrape)

-      The WebsiteAgent can also scrape based on incoming events. It will scrape the url contained in the `url` key of the incoming event payload, or if you set `url_from_event` it is used as a Liquid template to generate the url to access. If you specify `merge` as the `mode`, it will retain the old payload and update it with the new values.
+      The WebsiteAgent can also scrape based on incoming events.
+
+      * If the Event contains a `url` key, that URL will be fetched.
+      * For more control, you can set the `url_from_event` option and it will be used as a Liquid template to generate the url to access based on the Event.
+      * If you set `event_data_path` to the [JSONPath](http://goessner.net/articles/JsonPath/) of content in the Event, that will be used directly without fetching any URL.
+      * If you specify `merge` for the `mode` option, Huginn will retain the old payload and update it with the new values.

       # Supported Document Types

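Concretely, the new option combines with the existing ones as in the following sketch. It mirrors the HTML-data spec added below; the `some_object.some_data` path and payload shape are that spec's example, not a required layout:

```ruby
# Hypothetical WebsiteAgent options using event_data_path.
# No URL is fetched; the agent parses whatever it finds at the JSONPath
# in each incoming Event's payload.
{
  'type' => 'html',
  'mode' => 'merge',                            # keep the incoming payload, add extracted values
  'event_data_path' => 'some_object.some_data', # JSONPath into the incoming Event payload
  'extract' => {
    'title' => { 'css' => '.title', 'value' => './/text()' },
    'body'  => { 'css' => 'div span.body', 'value' => './/text()' }
  },
  'expected_update_period_in_days' => '2'
}
```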
@@ -140,7 +145,7 @@ module Agents

     def validate_options
       # Check for required fields
-      errors.add(:base, "either url or url_from_event is required") unless options['url'].present? || options['url_from_event'].present?
+      errors.add(:base, "either url, url_from_event, or event_data_path are required") unless options['url'].present? || options['url_from_event'].present? || options['event_data_path'].present?
       errors.add(:base, "expected_update_period_in_days is required") unless options['expected_update_period_in_days'].present?
       validate_extract_options!

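The presence check now passes when any one of the three source options is set; a quick sketch of the contract (option values illustrative):

```ruby
# Each of these alone satisfies the new check in validate_options:
{ 'url' => 'http://example.com' }
{ 'url_from_event' => 'http://example.org/?url={{url | uri_escape}}' }
{ 'event_data_path' => 'some_object.some_data' }

# With none of them present, validation fails with:
#   "either url, url_from_event, or event_data_path are required"
```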
@@ -251,15 +256,15 @@ module Agents
       check_urls(interpolated['url'])
     end

-    def check_urls(in_url, payload = {})
+    def check_urls(in_url, existing_payload = {})
       return unless in_url.present?

       Array(in_url).each do |url|
-        check_url(url, payload)
+        check_url(url, existing_payload)
       end
     end

-    def check_url(url, payload = {})
+    def check_url(url, existing_payload = {})
       unless /\Ahttps?:\/\//i === url
         error "Ignoring a non-HTTP url: #{url.inspect}"
         return
@@ -271,70 +276,91 @@ module Agents

       interpolation_context.stack {
         interpolation_context['_response_'] = ResponseDrop.new(response)
-        body = response.body
-        doc = parse(body)
+        handle_data(response.body, response.env[:url], existing_payload)
+      }
+    rescue => e
+      error "Error when fetching url: #{e.message}\n#{e.backtrace.join("\n")}"
+    end

-        if extract_full_json?
-          if store_payload!(previous_payloads(1), doc)
-            log "Storing new result for '#{name}': #{doc.inspect}"
-            create_event payload: payload.merge(doc)
-          end
-          return
+    def handle_data(body, url, existing_payload)
+      doc = parse(body)
+
+      if extract_full_json?
+        if store_payload!(previous_payloads(1), doc)
+          log "Storing new result for '#{name}': #{doc.inspect}"
+          create_event payload: existing_payload.merge(doc)
         end
+        return
+      end

-        output =
-          case extraction_type
+      output =
+        case extraction_type
         when 'json'
           extract_json(doc)
         when 'text'
           extract_text(doc)
         else
           extract_xml(doc)
-          end
+        end

-        num_unique_lengths = interpolated['extract'].keys.map { |name| output[name].length }.uniq
+      num_unique_lengths = interpolated['extract'].keys.map { |name| output[name].length }.uniq

-        if num_unique_lengths.length != 1
-          raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
-        end
+      if num_unique_lengths.length != 1
+        raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
+      end

-        old_events = previous_payloads num_unique_lengths.first
-        num_unique_lengths.first.times do |index|
-          result = {}
-          interpolated['extract'].keys.each do |name|
-            result[name] = output[name][index]
-            if name.to_s == 'url'
-              result[name] = (response.env[:url] + Utils.normalize_uri(result[name])).to_s
-            end
+      old_events = previous_payloads num_unique_lengths.first
+      num_unique_lengths.first.times do |index|
+        result = {}
+        interpolated['extract'].keys.each do |name|
+          result[name] = output[name][index]
+          if name.to_s == 'url' && url.present?
+            result[name] = (url + Utils.normalize_uri(result[name])).to_s
          end
+        end

-          if store_payload!(old_events, result)
-            log "Storing new parsed result for '#{name}': #{result.inspect}"
-            create_event payload: payload.merge(result)
-          end
+        if store_payload!(old_events, result)
+          log "Storing new parsed result for '#{name}': #{result.inspect}"
+          create_event payload: existing_payload.merge(result)
         end
-      }
-    rescue => e
-      error "Error when fetching url: #{e.message}\n#{e.backtrace.join("\n")}"
+      end
     end

     def receive(incoming_events)
       incoming_events.each do |event|
         interpolate_with(event) do
-          url_to_scrape =
-            if url_template = options['url_from_event'].presence
-              interpolate_options(url_template)
+          existing_payload = interpolated['mode'].to_s == "merge" ? event.payload : {}
+
+          if event_data_path = options['event_data_path'].presence
+            data = Utils.value_at(event.payload, interpolate_options(event_data_path))
+            if data.present?
+              handle_event_data(data, event, existing_payload)
            else
-              event.payload['url']
+              error "No data was found in the Event payload at the JSONPath #{interpolate_options(event_data_path)}", inbound_event: event
            end
-          check_urls(url_to_scrape,
-                     interpolated['mode'].to_s == "merge" ? event.payload : {})
+          else
+            url_to_scrape =
+              if event_data_path = options['event_data_path'].presence
+                interpolate_options(event_data_path)
+              elsif url_template = options['url_from_event'].presence
+                interpolate_options(url_template)
+              else
+                event.payload['url']
+              end
+            check_urls(url_to_scrape, existing_payload)
+          end
         end
       end
     end

     private

+    def handle_event_data(data, event, existing_payload)
+      handle_data(data, event.payload['url'], existing_payload)
+    rescue => e
+      error "Error when handling event data: #{e.message}\n#{e.backtrace.join("\n")}", inbound_event: event
+    end
+
     # This method returns true if the result should be stored as a new event.
     # If mode is set to 'on_change', this method may return false and update an existing
     # event to expire further in the future.
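The new branch in `receive` leans on `Utils.value_at`, Huginn's JSONPath lookup helper. A minimal sketch of what that lookup yields for the payload shape used in the specs below (payload illustrative; `value_at` is expected to return the first JSONPath match):

```ruby
payload = {
  'some_object' => {
    'some_data' => '{"hello":"world"}'
  }
}

Utils.value_at(payload, 'some_object.some_data')
# => '{"hello":"world"}'
# The returned string is handed to handle_data, which parses it according to
# the agent's `type` option -- no HTTP request is made on this path.
```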
@@ -763,92 +763,186 @@ fire: hot
   end

   describe "#receive" do
-    before do
-      @event = Event.new
-      @event.agent = agents(:bob_rain_notifier_agent)
-      @event.payload = {
-        'url' => 'http://xkcd.com',
-        'link' => 'Random',
-      }
-    end
+    describe "with a url or url_from_event" do
+      before do
+        @event = Event.new
+        @event.agent = agents(:bob_rain_notifier_agent)
+        @event.payload = {
+          'url' => 'http://xkcd.com',
+          'link' => 'Random',
+        }
+      end

-    it "should scrape from the url element in incoming event payload" do
-      expect {
-        @checker.options = @valid_options
+      it "should scrape from the url element in incoming event payload" do
+        expect {
+          @checker.options = @valid_options
+          @checker.receive([@event])
+        }.to change { Event.count }.by(1)
+      end
+
+      it "should use url_from_event as url to scrape if it exists when receiving an event" do
+        stub = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
+
+        @checker.options = @valid_options.merge(
+          'url_from_event' => 'http://example.org/?url={{url | uri_escape}}'
+        )
         @checker.receive([@event])
-      }.to change { Event.count }.by(1)
-    end

-    it "should use url_from_event as url to scrape if it exists when receiving an event" do
-      stub = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
+        expect(stub).to have_been_requested
+      end

-      @checker.options = @valid_options.merge(
-        'url_from_event' => 'http://example.org/?url={{url | uri_escape}}'
-      )
-      @checker.receive([@event])
+      it "should allow url_from_event to be an array of urls" do
+        stub1 = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
+        stub2 = stub_request(:any, 'http://google.org/?url=http%3A%2F%2Fxkcd.com')

-      expect(stub).to have_been_requested
-    end
+        @checker.options = @valid_options.merge(
+          'url_from_event' => ['http://example.org/?url={{url | uri_escape}}', 'http://google.org/?url={{url | uri_escape}}']
+        )
+        @checker.receive([@event])

-    it "should allow url_from_event to be an array of urls" do
-      stub1 = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
-      stub2 = stub_request(:any, 'http://google.org/?url=http%3A%2F%2Fxkcd.com')
+        expect(stub1).to have_been_requested
+        expect(stub2).to have_been_requested
+      end

-      @checker.options = @valid_options.merge(
-        'url_from_event' => ['http://example.org/?url={{url | uri_escape}}', 'http://google.org/?url={{url | uri_escape}}']
-      )
-      @checker.receive([@event])
+      it "should interpolate values from incoming event payload" do
+        expect {
+          @valid_options['extract'] = {
+            'from' => {
+              'xpath' => '*[1]',
+              'value' => '{{url | to_xpath}}'
+            },
+            'to' => {
+              'xpath' => '(//a[@href and text()={{link | to_xpath}}])[1]',
+              'value' => '@href'
+            },
+          }
+          @checker.options = @valid_options
+          @checker.receive([@event])
+        }.to change { Event.count }.by(1)

-      expect(stub1).to have_been_requested
-      expect(stub2).to have_been_requested
-    end
+        expect(Event.last.payload).to eq({
+          'from' => 'http://xkcd.com',
+          'to' => 'http://dynamic.xkcd.com/random/comic/',
+        })
+      end

-    it "should interpolate values from incoming event payload" do
-      expect {
-        @valid_options['extract'] = {
-          'from' => {
-            'xpath' => '*[1]',
-            'value' => '{{url | to_xpath}}'
-          },
-          'to' => {
-            'xpath' => '(//a[@href and text()={{link | to_xpath}}])[1]',
-            'value' => '@href'
-          },
-        }
-        @checker.options = @valid_options
-        @checker.receive([@event])
-      }.to change { Event.count }.by(1)
+      it "should interpolate values from incoming event payload and _response_" do
+        @event.payload['title'] = 'XKCD'

-      expect(Event.last.payload).to eq({
-        'from' => 'http://xkcd.com',
-        'to' => 'http://dynamic.xkcd.com/random/comic/',
-      })
-    end
+        expect {
+          @valid_options['extract'] = {
+            'response_info' => @valid_options['extract']['url'].merge(
+              'value' => '{% capture sentence %}The reponse from {{title}} was {{_response_.status}} {{_response_.headers.X-Status-Message}}.{% endcapture %}{{sentence | to_xpath}}'
+            )
+          }
+          @checker.options = @valid_options
+          @checker.receive([@event])
+        }.to change { Event.count }.by(1)

-    it "should interpolate values from incoming event payload and _response_" do
-      @event.payload['title'] = 'XKCD'
+        expect(Event.last.payload['response_info']).to eq('The reponse from XKCD was 200 OK.')
+      end

-      expect {
-        @valid_options['extract'] = {
-          'response_info' => @valid_options['extract']['url'].merge(
-            'value' => '{% capture sentence %}The reponse from {{title}} was {{_response_.status}} {{_response_.headers.X-Status-Message}}.{% endcapture %}{{sentence | to_xpath}}'
+      it "should support merging of events" do
+        expect {
+          @checker.options = @valid_options
+          @checker.options[:mode] = "merge"
+          @checker.receive([@event])
+        }.to change { Event.count }.by(1)
+        last_payload = Event.last.payload
+        expect(last_payload['link']).to eq('Random')
+      end
+    end
+
+    describe "with a event_data_path" do
+      describe "with json data" do
+        before do
+          @event = Event.new
+          @event.agent = agents(:bob_rain_notifier_agent)
+          @event.payload = {
+            'something' => 'some value',
+            'some_object' => {
+              'some_data' => { hello: 'world' }.to_json
+            }
+          }
+          @event.save!
+
+          @checker.options = @valid_options.merge(
+            'type' => 'json',
+            'event_data_path' => 'some_object.some_data',
+            'extract' => {
+              'value' => { 'path' => 'hello' }
+            }
+          )
+        end
+
+        it "should extract from the event data in the incoming event payload" do
+          expect {
+            @checker.receive([@event])
+          }.to change { Event.count }.by(1)
+          expect(@checker.events.last.payload).to eq({ 'value' => 'world' })
+        end
+
+        it "should support merge mode" do
+          @checker.options['mode'] = "merge"
+
+          expect {
+            @checker.receive([@event])
+          }.to change { Event.count }.by(1)
+          expect(@checker.events.last.payload).to eq(@event.payload.merge('value' => 'world'))
+        end
+
+        it "should output an error when nothing can be found at the path" do
+          @checker.options = @checker.options.merge(
+            'event_data_path' => 'some_object.mistake'
           )
-        }
-        @checker.options = @valid_options
-        @checker.receive([@event])
-      }.to change { Event.count }.by(1)

-      expect(Event.last.payload['response_info']).to eq('The reponse from XKCD was 200 OK.')
-    end
+          expect {
+            @checker.receive([@event])
+          }.to_not change { Event.count }

-    it "should support merging of events" do
-      expect {
-        @checker.options = @valid_options
-        @checker.options[:mode] = "merge"
-        @checker.receive([@event])
-      }.to change { Event.count }.by(1)
-      last_payload = Event.last.payload
-      expect(last_payload['link']).to eq('Random')
+          expect(@checker.logs.last.message).to match(/No data was found in the Event payload at the JSONPath some_object.mistake/)
+        end
+
+        it "should output an error when the data cannot be parsed" do
+          @event.update_attribute :payload, @event.payload.merge('some_object' => { 'some_data' => '{invalid json' })
+
+          expect {
+            @checker.receive([@event])
+          }.to_not change { Event.count }
+
+          expect(@checker.logs.last.message).to match(/Error when handling event data:/)
+        end
+      end
+
+      describe "with HTML data" do
+        before do
+          @event = Event.new
+          @event.agent = agents(:bob_rain_notifier_agent)
+          @event.payload = {
+            'url' => 'http://xkcd.com',
+            'some_object' => {
+              'some_data' => "<div><span class='title'>Title!</span><span class='body'>Body!</span></div>"
+            }
+          }
+          @event.save!
+
+          @checker.options = @valid_options.merge(
+            'type' => 'html',
+            'event_data_path' => 'some_object.some_data',
+            'extract' => {
+              'title' => { 'css' => ".title", 'value' => ".//text()" },
+              'body' => { 'css' => "div span.body", 'value' => ".//text()" }
+            }
+          )
+        end
+
+        it "should extract from the event data in the incoming event payload" do
+          expect {
+            @checker.receive([@event])
+          }.to change { Event.count }.by(1)
+          expect(@checker.events.last.payload).to eq({ 'title' => 'Title!', 'body' => 'Body!' })
+        end
+      end
     end
   end
 end
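End to end, the HTML case in the new specs reduces to this flow (payloads taken from the spec above):

```ruby
# Incoming Event payload:
{
  'url' => 'http://xkcd.com',
  'some_object' => {
    'some_data' => "<div><span class='title'>Title!</span><span class='body'>Body!</span></div>"
  }
}

# With 'event_data_path' => 'some_object.some_data' and 'type' => 'html',
# the embedded fragment is parsed directly (no HTTP request) and extraction
# produces the outgoing payload:
#   { 'title' => 'Title!', 'body' => 'Body!' }
```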