| @@ -18,14 +18,13 @@ module Agents | ||
| 18 | 18 |  | 
| 19 | 19 | Specify a `url` and select a `mode` for when to create Events based on the scraped data, either `all`, `on_change`, or `merge` (if fetching based on an Event, see below). | 
| 20 | 20 |  | 
| 21 | - `url` can be a single url, or an array of urls (for example, for multiple pages with the exact same structure but different content to scrape) | |
| 21 | + The `url` option can be a single url, or an array of urls (for example, for multiple pages with the exact same structure but different content to scrape). | |
| 22 | 22 |  | 
| 23 | 23 | The WebsiteAgent can also scrape based on incoming events. | 
| 24 | 24 |  | 
| 25 | - * If the Event contains a `url` key, that URL will be fetched. | |
| 26 | - * For more control, you can set the `url_from_event` option and it will be used as a Liquid template to generate the url to access based on the Event. | |
| 27 | -      * If you set `data_from_event` to a Liquid template, it will be used to generate the data directly without fetching any URL.  (For example, set it to `{{ html }}` to use HTML contained in the `html` key of the incoming Event.) | |
| 28 | - * If you specify `merge` for the `mode` option, Huginn will retain the old payload and update it with the new values. | |
| 25 | +      * Set the `url_from_event` option to a Liquid template to generate the url to access based on the Event.  (To fetch the url in the Event's `url` key, for example, set `url_from_event` to `{{ url }}`.) | |
| 26 | +      * Alternatively, set `data_from_event` to a Liquid template to use data directly without fetching any URL.  (For example, set it to `{{ html }}` to use HTML contained in the `html` key of the incoming Event.) | |
| 27 | + * If you specify `merge` for the `mode` option, Huginn will retain the old payload and update it with new values. | |
| 29 | 28 |  | 
| 30 | 29 | # Supported Document Types | 
| 31 | 30 |  | 
| @@ -343,7 +342,7 @@ module Agents | ||
| 343 | 342 | if url_template = options['url_from_event'].presence | 
| 344 | 343 | interpolate_options(url_template) | 
| 345 | 344 | else | 
| 346 | - event.payload['url'].presence || interpolated['url'] | |
| 345 | + interpolated['url'] | |
| 347 | 346 | end | 
| 348 | 347 | check_urls(url_to_scrape, existing_payload) | 
| 349 | 348 | end | 
| @@ -0,0 +1,22 @@ | ||
| 1 | +class WebsiteAgentDoesNotUseEventUrl < ActiveRecord::Migration | |
| 2 | + def up | |
| 3 | + # Until this migration, if a WebsiteAgent received Events and did not have a `url_from_event` option set, | |
| 4 | + # it would use the `url` from the Event's payload. If the Event did not have a `url` in its payload, the | |
| 5 | + # WebsiteAgent would fall back to its own `url` option. This migration assumes that if someone has wired a WebsiteAgent to receive Events | |
| 6 | + # and has not set `url_from_event` or `data_from_event`, they were trying to use the Event's `url` payload, so we | |
| 7 | +    # set `url_from_event` to `{{ url }}` for them. | |
| 8 | + Agents::WebsiteAgent.find_each do |agent| | |
| 9 | + next unless agent.sources.count > 0 | |
| 10 | + | |
| 11 | + if !agent.options['data_from_event'].present? && !agent.options['url_from_event'].present? | |
| 12 | +        agent.options['url_from_event'] = '{{ url }}' | |
| 13 | + agent.save! | |
| 14 | +        puts ">> Setting `url_from_event` on WebsiteAgent##{agent.id} to {{ url }} because it is wired" | |
| 15 | + puts ">> to receive Events, and the WebsiteAgent no longer uses the Event's `url` value directly." | |
| 16 | + end | |
| 17 | + end | |
| 18 | + end | |
| 19 | + | |
| 20 | + def down | |
| 21 | + end | |
| 22 | +end | 
| @@ -768,20 +768,13 @@ fire: hot | ||
| 768 | 768 | @event = Event.new | 
| 769 | 769 | @event.agent = agents(:bob_rain_notifier_agent) | 
| 770 | 770 |            @event.payload = { | 
| 771 | - 'url' => 'http://xkcd.com', | |
| 771 | + 'url' => 'http://foo.com', | |
| 772 | 772 | 'link' => 'Random' | 
| 773 | 773 | } | 
| 774 | 774 | end | 
| 775 | 775 |  | 
| 776 | - it "should scrape from the url element in incoming event payload" do | |
| 777 | -          expect { | |
| 778 | - @checker.options = @valid_options | |
| 779 | - @checker.receive([@event]) | |
| 780 | -          }.to change { Event.count }.by(1) | |
| 781 | - end | |
| 782 | - | |
| 783 | - it "should use url_from_event as url to scrape if it exists when receiving an event" do | |
| 784 | - stub = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com') | |
| 776 | + it "should use url_from_event as the url to scrape" do | |
| 777 | + stub = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Ffoo.com') | |
| 785 | 778 |  | 
| 786 | 779 | @checker.options = @valid_options.merge( | 
| 787 | 780 |              'url_from_event' => 'http://example.org/?url={{url | uri_escape}}' | 
| @@ -791,9 +784,16 @@ fire: hot | ||
| 791 | 784 | expect(stub).to have_been_requested | 
| 792 | 785 | end | 
| 793 | 786 |  | 
| 787 | + it "should use the Agent's `url` option if url_from_event is not set" do | |
| 788 | +          expect { | |
| 789 | + @checker.options = @valid_options | |
| 790 | + @checker.receive([@event]) | |
| 791 | +          }.to change { Event.count }.by(1) | |
| 792 | + end | |
| 793 | + | |
| 794 | 794 | it "should allow url_from_event to be an array of urls" do | 
| 795 | - stub1 = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com') | |
| 796 | - stub2 = stub_request(:any, 'http://google.org/?url=http%3A%2F%2Fxkcd.com') | |
| 795 | + stub1 = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Ffoo.com') | |
| 796 | + stub2 = stub_request(:any, 'http://google.org/?url=http%3A%2F%2Ffoo.com') | |
| 797 | 797 |  | 
| 798 | 798 | @checker.options = @valid_options.merge( | 
| 799 | 799 |              'url_from_event' => ['http://example.org/?url={{url | uri_escape}}', 'http://google.org/?url={{url | uri_escape}}'] | 
| @@ -805,7 +805,10 @@ fire: hot | ||
| 805 | 805 | end | 
| 806 | 806 |  | 
| 807 | 807 | it "should interpolate values from incoming event payload" do | 
| 808 | +          stub_request(:any, /foo/).to_return(body: File.read(Rails.root.join("spec/data_fixtures/xkcd.html")), status: 200) | |
| 809 | + | |
| 808 | 810 |            expect { | 
| 811 | +            @valid_options['url_from_event'] = '{{ url }}' | |
| 809 | 812 |              @valid_options['extract'] = { | 
| 810 | 813 |                'from' => { | 
| 811 | 814 | 'xpath' => '*[1]', | 
| @@ -821,7 +824,7 @@ fire: hot | ||
| 821 | 824 |            }.to change { Event.count }.by(1) | 
| 822 | 825 |  | 
| 823 | 826 |            expect(Event.last.payload).to eq({ | 
| 824 | - 'from' => 'http://xkcd.com', | |
| 827 | + 'from' => 'http://foo.com', | |
| 825 | 828 | 'to' => 'http://dynamic.xkcd.com/random/comic/', | 
| 826 | 829 | }) | 
| 827 | 830 | end | 
| @@ -1075,7 +1078,6 @@ fire: hot | ||
| 1075 | 1078 | event = @events[6] | 
| 1076 | 1079 |          expect(event.payload['url']).to eq("https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8") | 
| 1077 | 1080 | end | 
| 1078 | - | |
| 1079 | 1081 | end | 
| 1080 | 1082 | end | 
| 1081 | 1083 | end |