10 gadi atpakaļ · 3469ed2a5e
--- a/app/models/agents/website_agent.rb
+++ b/app/models/agents/website_agent.rb
@@ -18,14 +18,13 @@ module Agents
 
                 
              
 
                       Specify a `url` and select a `mode` for when to create Events based on the scraped data, either `all`, `on_change`, or `merge` (if fetching based on an Event, see below).
              
 
                 
              
 
                -      `url` can be a single url, or an array of urls (for example, for multiple pages with the exact same structure but different content to scrape)
              
 
                +      The `url` option can be a single url, or an array of urls (for example, for multiple pages with the exact same structure but different content to scrape).
              
 
                 
              
 
                       The WebsiteAgent can also scrape based on incoming events.
              
 
                 
              
 
                -      * If the Event contains a `url` key, that URL will be fetched.
              
 
                -      * For more control, you can set the `url_from_event` option and it will be used as a Liquid template to generate the url to access based on the Event.
              
 
                -      * If you set `data_from_event` to a Liquid template, it will be used to generate the data directly without fetching any URL.  (For example, set it to `{{ html }}` to use HTML contained in the `html` key of the incoming Event.)
              
 
                -      * If you specify `merge` for the `mode` option, Huginn will retain the old payload and update it with the new values.
              
 
                +      * Set the `url_from_event` option to a Liquid template to generate the url to access based on the Event.  (To fetch the url in the Event's `url` key, for example, set `url_from_event` to `{{ url }}`.)
              
 
                +      * Alternatively, set `data_from_event` to a Liquid template to use data directly without fetching any URL.  (For example, set it to `{{ html }}` to use HTML contained in the `html` key of the incoming Event.)
              
 
                +      * If you specify `merge` for the `mode` option, Huginn will retain the old payload and update it with new values.
              
 
                 
              
 
                       # Supported Document Types
              
 
                 
              
@@ -343,7 +342,7 @@ module Agents
 
                               if url_template = options['url_from_event'].presence
              
 
                                 interpolate_options(url_template)
              
 
                               else
              
 
                -                event.payload['url'].presence || interpolated['url']
              
 
                +                interpolated['url']
              
 
                               end
              
 
                             check_urls(url_to_scrape, existing_payload)
              
 
                           end
              
--- a/db/migrate/20160108221620_website_agent_does_not_use_event_url.rb
+++ b/db/migrate/20160108221620_website_agent_does_not_use_event_url.rb
@@ -0,0 +1,22 @@
 
                +class WebsiteAgentDoesNotUseEventUrl < ActiveRecord::Migration
              
 
                +  def up
              
 
                +    # Until this migration, if a WebsiteAgent received Events and did not have a `url_from_event` option set,
              
 
                +    # it would use the `url` from the Event's payload.  If the Event did not have a `url` in its payload, the
              
 
                +    # WebsiteAgent would do nothing. This migration assumes that if someone has wired a WebsiteAgent to receive Events
              
 
                +    # and has not set `url_from_event` or `data_from_event`, they were trying to use the Event's `url` payload, so we
              
 
                +    # set `url_from_event` to `{{ url }}` for them.
              
 
                +    Agents::WebsiteAgent.find_each do |agent|
              
 
                +      next unless agent.sources.count > 0
              
 
                +
              
 
                +      if !agent.options['data_from_event'].present? && !agent.options['url_from_event'].present?
              
 
                +        agent.options['url_from_event'] = '{{ url }}'
              
 
                +        agent.save!
              
 
                +        puts ">> Setting `url_from_event` on WebsiteAgent##{agent.id} to {{ url }} because it is wired"
              
 
                +        puts ">> to receive Events, and the WebsiteAgent no longer uses the Event's `url` value directly."
              
 
                +      end
              
 
                +    end
              
 
                +  end
              
 
                +
              
 
                +  def down
              
 
                +  end
              
 
                +end
              
--- a/spec/models/agents/website_agent_spec.rb
+++ b/spec/models/agents/website_agent_spec.rb
@@ -768,20 +768,13 @@ fire: hot
 
                           @event = Event.new
              
 
                           @event.agent = agents(:bob_rain_notifier_agent)
              
 
                           @event.payload = {
              
 
                -            'url' => 'http://xkcd.com',
              
 
                +            'url' => 'http://foo.com',
              
 
                             'link' => 'Random'
              
 
                           }
              
 
                         end
              
 
                 
              
 
                -        it "should scrape from the url element in incoming event payload" do
              
 
                -          expect {
              
 
                -            @checker.options = @valid_options
              
 
                -            @checker.receive([@event])
              
 
                -          }.to change { Event.count }.by(1)
              
 
                -        end
              
 
                -
              
 
                -        it "should use url_from_event as url to scrape if it exists when receiving an event" do
              
 
                -          stub = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
              
 
                +        it "should use url_from_event as the url to scrape" do
              
 
                +          stub = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Ffoo.com')
              
 
                 
              
 
                           @checker.options = @valid_options.merge(
              
 
                             'url_from_event' => 'http://example.org/?url={{url | uri_escape}}'
              
@@ -791,9 +784,16 @@ fire: hot
 
                           expect(stub).to have_been_requested
              
 
                         end
              
 
                 
              
 
                +        it "should use the Agent's `url` option if url_from_event is not set" do
              
 
                +          expect {
              
 
                +            @checker.options = @valid_options
              
 
                +            @checker.receive([@event])
              
 
                +          }.to change { Event.count }.by(1)
              
 
                +        end
              
 
                +
              
 
                         it "should allow url_from_event to be an array of urls" do
              
 
                -          stub1 = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
              
 
                -          stub2 = stub_request(:any, 'http://google.org/?url=http%3A%2F%2Fxkcd.com')
              
 
                +          stub1 = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Ffoo.com')
              
 
                +          stub2 = stub_request(:any, 'http://google.org/?url=http%3A%2F%2Ffoo.com')
              
 
                 
              
 
                           @checker.options = @valid_options.merge(
              
 
                             'url_from_event' => ['http://example.org/?url={{url | uri_escape}}', 'http://google.org/?url={{url | uri_escape}}']
              
@@ -805,7 +805,10 @@ fire: hot
 
                         end
              
 
                 
              
 
                         it "should interpolate values from incoming event payload" do
              
 
                +          stub_request(:any, /foo/).to_return(body: File.read(Rails.root.join("spec/data_fixtures/xkcd.html")), status: 200)
              
 
                +
              
 
                           expect {
              
 
                +            @valid_options['url_from_event'] = '{{ url }}'
              
 
                             @valid_options['extract'] = {
              
 
                               'from' => {
              
 
                                 'xpath' => '*[1]',
              
@@ -821,7 +824,7 @@ fire: hot
 
                           }.to change { Event.count }.by(1)
              
 
                 
              
 
                           expect(Event.last.payload).to eq({
              
 
                -            'from' => 'http://xkcd.com',
              
 
                +            'from' => 'http://foo.com',
              
 
                             'to' => 'http://dynamic.xkcd.com/random/comic/',
              
 
                           })
              
 
                         end
              
@@ -1075,7 +1078,6 @@ fire: hot
 
                         event = @events[6]
              
 
                         expect(event.payload['url']).to eq("https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8")
              
 
                       end
              
 
                -
              
 
                     end
              
 
                   end
              
 
                 end