@@ -20,7 +20,12 @@ module Agents
 
       `url` can be a single url, or an array of urls (for example, for multiple pages with the exact same structure but different content to scrape)
 
-      The WebsiteAgent can also scrape based on incoming events. It will scrape the url contained in the `url` key of the incoming event payload, or if you set `url_from_event` it is used as a Liquid template to generate the url to access. If you specify `merge` as the `mode`, it will retain the old payload and update it with the new values.
+      The WebsiteAgent can also scrape based on incoming events.
+
+      * If the Event contains a `url` key, that URL will be fetched.
+      * For more control, you can set the `url_from_event` option and it will be used as a Liquid template to generate the url to access based on the Event.
+      * If you set `event_data_path` to the [JSONPath](http://goessner.net/articles/JsonPath/) of content in the Event, that will be used directly without fetching any URL.
+      * If you specify `merge` for the `mode` option, Huginn will retain the old payload and update it with the new values.
 
       # Supported Document Types
 
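To make the new option concrete, here is a minimal sketch of agent options exercising `event_data_path`. The option names (`type`, `event_data_path`, `mode`, `extract`, `expected_update_period_in_days`) all appear in the diff and the specs below; the values themselves are hypothetical:

```json
{
  "expected_update_period_in_days": "2",
  "type": "json",
  "mode": "merge",
  "event_data_path": "some_object.some_data",
  "extract": {
    "value": { "path": "hello" }
  }
}
```

With options like these, an incoming Event carrying a JSON string at `some_object.some_data` is parsed directly and no HTTP request is made.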
@@ -140,7 +145,7 @@ module Agents
 
     def validate_options
       # Check for required fields
-      errors.add(:base, "either url or url_from_event is required") unless options['url'].present? || options['url_from_event'].present?
+      errors.add(:base, "either url, url_from_event, or event_data_path is required") unless options['url'].present? || options['url_from_event'].present? || options['event_data_path'].present?
       errors.add(:base, "expected_update_period_in_days is required") unless options['expected_update_period_in_days'].present?
       validate_extract_options!
 
@@ -251,15 +256,15 @@ module Agents
       check_urls(interpolated['url'])
     end
 
-    def check_urls(in_url, payload = {})
+    def check_urls(in_url, existing_payload = {})
       return unless in_url.present?
 
       Array(in_url).each do |url|
-        check_url(url, payload)
+        check_url(url, existing_payload)
       end
     end
 
-    def check_url(url, payload = {})
+    def check_url(url, existing_payload = {})
       unless /\Ahttps?:\/\//i === url
         error "Ignoring a non-HTTP url: #{url.inspect}"
         return
@@ -271,70 +276,89 @@ module Agents
 
       interpolation_context.stack {
         interpolation_context['_response_'] = ResponseDrop.new(response)
-        body = response.body
-        doc = parse(body)
+        handle_data(response.body, response.env[:url], existing_payload)
+      }
+    rescue => e
+      error "Error when fetching url: #{e.message}\n#{e.backtrace.join("\n")}"
+    end
 
-        if extract_full_json?
-          if store_payload!(previous_payloads(1), doc)
-            log "Storing new result for '#{name}': #{doc.inspect}"
-            create_event payload: payload.merge(doc)
-          end
-          return
+    def handle_data(body, url, existing_payload)
+      doc = parse(body)
+
+      if extract_full_json?
+        if store_payload!(previous_payloads(1), doc)
+          log "Storing new result for '#{name}': #{doc.inspect}"
+          create_event payload: existing_payload.merge(doc)
         end
+        return
+      end
 
-        output =
-          case extraction_type
+      output =
+        case extraction_type
         when 'json'
           extract_json(doc)
         when 'text'
          extract_text(doc)
         else
          extract_xml(doc)
-          end
+        end
 
-        num_unique_lengths = interpolated['extract'].keys.map { |name| output[name].length }.uniq
+      num_unique_lengths = interpolated['extract'].keys.map { |name| output[name].length }.uniq
 
-        if num_unique_lengths.length != 1
-          raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
-        end
+      if num_unique_lengths.length != 1
+        raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
+      end
 
-        old_events = previous_payloads num_unique_lengths.first
-        num_unique_lengths.first.times do |index|
-          result = {}
-          interpolated['extract'].keys.each do |name|
-            result[name] = output[name][index]
-            if name.to_s == 'url'
-              result[name] = (response.env[:url] + Utils.normalize_uri(result[name])).to_s
-            end
+      old_events = previous_payloads num_unique_lengths.first
+      num_unique_lengths.first.times do |index|
+        result = {}
+        interpolated['extract'].keys.each do |name|
+          result[name] = output[name][index]
+          if name.to_s == 'url' && url.present?
+            result[name] = (url + Utils.normalize_uri(result[name])).to_s
           end
+        end
 
-          if store_payload!(old_events, result)
-            log "Storing new parsed result for '#{name}': #{result.inspect}"
-            create_event payload: payload.merge(result)
-          end
+        if store_payload!(old_events, result)
+          log "Storing new parsed result for '#{name}': #{result.inspect}"
+          create_event payload: existing_payload.merge(result)
         end
-      }
-    rescue => e
-      error "Error when fetching url: #{e.message}\n#{e.backtrace.join("\n")}"
+      end
     end
 
     def receive(incoming_events)
       incoming_events.each do |event|
         interpolate_with(event) do
-          url_to_scrape =
-            if url_template = options['url_from_event'].presence
-              interpolate_options(url_template)
+          existing_payload = interpolated['mode'].to_s == "merge" ? event.payload : {}
+
+          if event_data_path = options['event_data_path'].presence
+            data = Utils.value_at(event.payload, interpolate_options(event_data_path))
+            if data.present?
+              handle_event_data(data, event, existing_payload)
             else
-              event.payload['url']
+              error "No data was found in the Event payload at the JSONPath #{interpolate_options(event_data_path)}", inbound_event: event
             end
-          check_urls(url_to_scrape,
-                     interpolated['mode'].to_s == "merge" ? event.payload : {})
+          else
+            url_to_scrape =
+              if url_template = options['url_from_event'].presence
+                interpolate_options(url_template)
+              else
+                event.payload['url']
+              end
+            check_urls(url_to_scrape, existing_payload)
+          end
         end
       end
     end
 
     private
 
+    def handle_event_data(data, event, existing_payload)
+      handle_data(data, event.payload['url'], existing_payload)
+    rescue => e
+      error "Error when handling event data: #{e.message}\n#{e.backtrace.join("\n")}", inbound_event: event
+    end
+
     # This method returns true if the result should be stored as a new event.
     # If mode is set to 'on_change', this method may return false and update an existing
     # event to expire further in the future.
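As a sketch of the `receive` dispatch above (payload values here are hypothetical, modeled on the specs below): with `event_data_path` set to `some_object.some_data`, the agent calls `handle_data` directly on the string found at that path and fetches nothing, whereas without it the Event's `url` key (or `url_from_event`) is fetched as before.

```json
{
  "url": "http://example.com/page",
  "some_object": {
    "some_data": "{\"hello\": \"world\"}"
  }
}
```

In `merge` mode the emitted Event would carry this whole payload plus the extracted values.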
@@ -763,92 +763,186 @@ fire: hot
     end
 
   describe "#receive" do
-    before do
-      @event = Event.new
-      @event.agent = agents(:bob_rain_notifier_agent)
-      @event.payload = {
-        'url' => 'http://xkcd.com',
-        'link' => 'Random',
-      }
-    end
+    describe "with a url or url_from_event" do
+      before do
+        @event = Event.new
+        @event.agent = agents(:bob_rain_notifier_agent)
+        @event.payload = {
+          'url' => 'http://xkcd.com',
+          'link' => 'Random',
+        }
+      end
 
-    it "should scrape from the url element in incoming event payload" do
-      expect {
-        @checker.options = @valid_options
+      it "should scrape from the url element in incoming event payload" do
+        expect {
+          @checker.options = @valid_options
+          @checker.receive([@event])
+        }.to change { Event.count }.by(1)
+      end
+
+      it "should use url_from_event as url to scrape if it exists when receiving an event" do
+        stub = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
+
+        @checker.options = @valid_options.merge(
+          'url_from_event' => 'http://example.org/?url={{url | uri_escape}}'
+        )
         @checker.receive([@event])
-      }.to change { Event.count }.by(1)
-    end
 
-    it "should use url_from_event as url to scrape if it exists when receiving an event" do
-      stub = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
+        expect(stub).to have_been_requested
+      end
 
-      @checker.options = @valid_options.merge(
-        'url_from_event' => 'http://example.org/?url={{url | uri_escape}}'
-      )
-      @checker.receive([@event])
+      it "should allow url_from_event to be an array of urls" do
+        stub1 = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
+        stub2 = stub_request(:any, 'http://google.org/?url=http%3A%2F%2Fxkcd.com')
 
-      expect(stub).to have_been_requested
-    end
+        @checker.options = @valid_options.merge(
+          'url_from_event' => ['http://example.org/?url={{url | uri_escape}}', 'http://google.org/?url={{url | uri_escape}}']
+        )
+        @checker.receive([@event])
 
-    it "should allow url_from_event to be an array of urls" do
-      stub1 = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
-      stub2 = stub_request(:any, 'http://google.org/?url=http%3A%2F%2Fxkcd.com')
+        expect(stub1).to have_been_requested
+        expect(stub2).to have_been_requested
+      end
 
-      @checker.options = @valid_options.merge(
-        'url_from_event' => ['http://example.org/?url={{url | uri_escape}}', 'http://google.org/?url={{url | uri_escape}}']
-      )
-      @checker.receive([@event])
+      it "should interpolate values from incoming event payload" do
+        expect {
+          @valid_options['extract'] = {
+            'from' => {
+              'xpath' => '*[1]',
+              'value' => '{{url | to_xpath}}'
+            },
+            'to' => {
+              'xpath' => '(//a[@href and text()={{link | to_xpath}}])[1]',
+              'value' => '@href'
+            },
+          }
+          @checker.options = @valid_options
+          @checker.receive([@event])
+        }.to change { Event.count }.by(1)
 
-      expect(stub1).to have_been_requested
-      expect(stub2).to have_been_requested
-    end
+        expect(Event.last.payload).to eq({
+          'from' => 'http://xkcd.com',
+          'to' => 'http://dynamic.xkcd.com/random/comic/',
+        })
+      end
 
-    it "should interpolate values from incoming event payload" do
-      expect {
-        @valid_options['extract'] = {
-          'from' => {
-            'xpath' => '*[1]',
-            'value' => '{{url | to_xpath}}'
-          },
-          'to' => {
-            'xpath' => '(//a[@href and text()={{link | to_xpath}}])[1]',
-            'value' => '@href'
-          },
-        }
-        @checker.options = @valid_options
-        @checker.receive([@event])
-      }.to change { Event.count }.by(1)
+      it "should interpolate values from incoming event payload and _response_" do
+        @event.payload['title'] = 'XKCD'
 
-      expect(Event.last.payload).to eq({
-        'from' => 'http://xkcd.com',
-        'to' => 'http://dynamic.xkcd.com/random/comic/',
-      })
-    end
+        expect {
+          @valid_options['extract'] = {
+            'response_info' => @valid_options['extract']['url'].merge(
+              'value' => '{% capture sentence %}The response from {{title}} was {{_response_.status}} {{_response_.headers.X-Status-Message}}.{% endcapture %}{{sentence | to_xpath}}'
+            )
+          }
+          @checker.options = @valid_options
+          @checker.receive([@event])
+        }.to change { Event.count }.by(1)
 
-    it "should interpolate values from incoming event payload and _response_" do
-      @event.payload['title'] = 'XKCD'
+        expect(Event.last.payload['response_info']).to eq('The response from XKCD was 200 OK.')
+      end
 
-      expect {
-        @valid_options['extract'] = {
-          'response_info' => @valid_options['extract']['url'].merge(
-            'value' => '{% capture sentence %}The reponse from {{title}} was {{_response_.status}} {{_response_.headers.X-Status-Message}}.{% endcapture %}{{sentence | to_xpath}}'
+      it "should support merging of events" do
+        expect {
+          @checker.options = @valid_options
+          @checker.options[:mode] = "merge"
+          @checker.receive([@event])
+        }.to change { Event.count }.by(1)
+        last_payload = Event.last.payload
+        expect(last_payload['link']).to eq('Random')
+      end
+    end
+
+    describe "with an event_data_path" do
+      describe "with JSON data" do
+        before do
+          @event = Event.new
+          @event.agent = agents(:bob_rain_notifier_agent)
+          @event.payload = {
+            'something' => 'some value',
+            'some_object' => {
+              'some_data' => { hello: 'world' }.to_json
+            }
+          }
+          @event.save!
+
+          @checker.options = @valid_options.merge(
+            'type' => 'json',
+            'event_data_path' => 'some_object.some_data',
+            'extract' => {
+              'value' => { 'path' => 'hello' }
+            }
+          )
+        end
+
+        it "should extract from the event data in the incoming event payload" do
+          expect {
+            @checker.receive([@event])
+          }.to change { Event.count }.by(1)
+          expect(@checker.events.last.payload).to eq({ 'value' => 'world' })
+        end
+
+        it "should support merge mode" do
+          @checker.options['mode'] = "merge"
+
+          expect {
+            @checker.receive([@event])
+          }.to change { Event.count }.by(1)
+          expect(@checker.events.last.payload).to eq(@event.payload.merge('value' => 'world'))
+        end
+
+        it "should output an error when nothing can be found at the path" do
+          @checker.options = @checker.options.merge(
+            'event_data_path' => 'some_object.mistake'
           )
-        }
-        @checker.options = @valid_options
-        @checker.receive([@event])
-      }.to change { Event.count }.by(1)
 
-      expect(Event.last.payload['response_info']).to eq('The reponse from XKCD was 200 OK.')
-    end
+          expect {
+            @checker.receive([@event])
+          }.to_not change { Event.count }
 
-    it "should support merging of events" do
-      expect {
-        @checker.options = @valid_options
-        @checker.options[:mode] = "merge"
-        @checker.receive([@event])
-      }.to change { Event.count }.by(1)
-      last_payload = Event.last.payload
-      expect(last_payload['link']).to eq('Random')
+          expect(@checker.logs.last.message).to match(/No data was found in the Event payload at the JSONPath some_object.mistake/)
+        end
+
+        it "should output an error when the data cannot be parsed" do
+          @event.update_attribute :payload, @event.payload.merge('some_object' => { 'some_data' => '{invalid json' })
+
+          expect {
+            @checker.receive([@event])
+          }.to_not change { Event.count }
+
+          expect(@checker.logs.last.message).to match(/Error when handling event data:/)
+        end
+      end
+
+      describe "with HTML data" do
+        before do
+          @event = Event.new
+          @event.agent = agents(:bob_rain_notifier_agent)
+          @event.payload = {
+            'url' => 'http://xkcd.com',
+            'some_object' => {
+              'some_data' => "<div><span class='title'>Title!</span><span class='body'>Body!</span></div>"
+            }
+          }
+          @event.save!
+
+          @checker.options = @valid_options.merge(
+            'type' => 'html',
+            'event_data_path' => 'some_object.some_data',
+            'extract' => {
+              'title' => { 'css' => ".title", 'value' => ".//text()" },
+              'body' => { 'css' => "div span.body", 'value' => ".//text()" }
+            }
+          )
+        end
+
+        it "should extract from the event data in the incoming event payload" do
+          expect {
+            @checker.receive([@event])
+          }.to change { Event.count }.by(1)
+          expect(@checker.events.last.payload).to eq({ 'title' => 'Title!', 'body' => 'Body!' })
+        end
+      end
     end
   end
 end