Merge pull request #1161 from cantino/website_agent_can_parse_payload

Add data_from_event option to the WebsiteAgent to handle data directly in incoming Events

Andrew Cantino 8 years ago
parent
commit
3da033eaa3
3 changed files with 231 additions and 112 deletions
  1. 1 0
      CHANGES.md
  2. 65 41
      app/models/agents/website_agent.rb
  3. 165 71
      spec/models/agents/website_agent_spec.rb

+ 1 - 0
CHANGES.md

@@ -1,5 +1,6 @@
1 1
 # Changes
2 2
 
3
+* Dec 26, 2015   - WebsiteAgent can accept a `data_from_event` Liquid template instead of a URL.
3 4
 * Oct 17, 2015   - TwitterSearchAgent added for running periodic Twitter searches.
4 5
 * Oct 17, 2015   - GapDetectorAgent added to alert when no data has been seen in a certain period of time.
5 6
 * Oct 12, 2015   - Slack agent supports attachments.

+ 65 - 41
app/models/agents/website_agent.rb

@@ -20,7 +20,12 @@ module Agents
20 20
 
21 21
       `url` can be a single url, or an array of urls (for example, for multiple pages with the exact same structure but different content to scrape)
22 22
 
23
-      The WebsiteAgent can also scrape based on incoming events. It will scrape the url contained in the `url` key of the incoming event payload, or if you set `url_from_event` it is used as a Liquid template to generate the url to access. If you specify `merge` as the `mode`, it will retain the old payload and update it with the new values.
23
+      The WebsiteAgent can also scrape based on incoming events.
24
+
25
+      * If the Event contains a `url` key, that URL will be fetched.
26
+      * For more control, you can set the `url_from_event` option and it will be used as a Liquid template to generate the url to access based on the Event.
27
+      * If you set `data_from_event` to a Liquid template, it will be used to generate the data directly without fetching any URL.  (For example, set it to `{{ html }}` to use HTML contained in the `html` key of the incoming Event.)
28
+      * If you specify `merge` for the `mode` option, Huginn will retain the old payload and update it with the new values.
24 29
 
25 30
       # Supported Document Types
26 31
 
@@ -140,7 +145,7 @@ module Agents
140 145
 
141 146
     def validate_options
142 147
       # Check for required fields
143
-      errors.add(:base, "either url or url_from_event is required") unless options['url'].present? || options['url_from_event'].present?
148
+      errors.add(:base, "either url, url_from_event, or data_from_event are required") unless options['url'].present? || options['url_from_event'].present? || options['data_from_event'].present?
144 149
       errors.add(:base, "expected_update_period_in_days is required") unless options['expected_update_period_in_days'].present?
145 150
       validate_extract_options!
146 151
 
@@ -251,15 +256,15 @@ module Agents
251 256
       check_urls(interpolated['url'])
252 257
     end
253 258
 
254
-    def check_urls(in_url, payload = {})
259
+    def check_urls(in_url, existing_payload = {})
255 260
       return unless in_url.present?
256 261
 
257 262
       Array(in_url).each do |url|
258
-        check_url(url, payload)
263
+        check_url(url, existing_payload)
259 264
       end
260 265
     end
261 266
 
262
-    def check_url(url, payload = {})
267
+    def check_url(url, existing_payload = {})
263 268
       unless /\Ahttps?:\/\//i === url
264 269
         error "Ignoring a non-HTTP url: #{url.inspect}"
265 270
         return
@@ -271,70 +276,89 @@ module Agents
271 276
 
272 277
       interpolation_context.stack {
273 278
         interpolation_context['_response_'] = ResponseDrop.new(response)
274
-        body = response.body
275
-        doc = parse(body)
279
+        handle_data(response.body, response.env[:url], existing_payload)
280
+      }
281
+    rescue => e
282
+      error "Error when fetching url: #{e.message}\n#{e.backtrace.join("\n")}"
283
+    end
276 284
 
277
-        if extract_full_json?
278
-          if store_payload!(previous_payloads(1), doc)
279
-            log "Storing new result for '#{name}': #{doc.inspect}"
280
-            create_event payload: payload.merge(doc)
281
-          end
282
-          return
285
+    def handle_data(body, url, existing_payload)
286
+      doc = parse(body)
287
+
288
+      if extract_full_json?
289
+        if store_payload!(previous_payloads(1), doc)
290
+          log "Storing new result for '#{name}': #{doc.inspect}"
291
+          create_event payload: existing_payload.merge(doc)
283 292
         end
293
+        return
294
+      end
284 295
 
285
-        output =
286
-          case extraction_type
296
+      output =
297
+        case extraction_type
287 298
           when 'json'
288 299
             extract_json(doc)
289 300
           when 'text'
290 301
             extract_text(doc)
291 302
           else
292 303
             extract_xml(doc)
293
-          end
304
+        end
294 305
 
295
-        num_unique_lengths = interpolated['extract'].keys.map { |name| output[name].length }.uniq
306
+      num_unique_lengths = interpolated['extract'].keys.map { |name| output[name].length }.uniq
296 307
 
297
-        if num_unique_lengths.length != 1
298
-          raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
299
-        end
308
+      if num_unique_lengths.length != 1
309
+        raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
310
+      end
300 311
 
301
-        old_events = previous_payloads num_unique_lengths.first
302
-        num_unique_lengths.first.times do |index|
303
-          result = {}
304
-          interpolated['extract'].keys.each do |name|
305
-            result[name] = output[name][index]
306
-            if name.to_s == 'url'
307
-              result[name] = (response.env[:url] + Utils.normalize_uri(result[name])).to_s
308
-            end
312
+      old_events = previous_payloads num_unique_lengths.first
313
+      num_unique_lengths.first.times do |index|
314
+        result = {}
315
+        interpolated['extract'].keys.each do |name|
316
+          result[name] = output[name][index]
317
+          if name.to_s == 'url' && url.present?
318
+            result[name] = (url + Utils.normalize_uri(result[name])).to_s
309 319
           end
320
+        end
310 321
 
311
-          if store_payload!(old_events, result)
312
-            log "Storing new parsed result for '#{name}': #{result.inspect}"
313
-            create_event payload: payload.merge(result)
314
-          end
322
+        if store_payload!(old_events, result)
323
+          log "Storing new parsed result for '#{name}': #{result.inspect}"
324
+          create_event payload: existing_payload.merge(result)
315 325
         end
316
-      }
317
-    rescue => e
318
-      error "Error when fetching url: #{e.message}\n#{e.backtrace.join("\n")}"
326
+      end
319 327
     end
320 328
 
321 329
     def receive(incoming_events)
322 330
       incoming_events.each do |event|
323 331
         interpolate_with(event) do
324
-          url_to_scrape =
325
-            if url_template = options['url_from_event'].presence
326
-              interpolate_options(url_template)
332
+          existing_payload = interpolated['mode'].to_s == "merge" ? event.payload : {}
333
+
334
+          if data_from_event = options['data_from_event'].presence
335
+            data = interpolate_options(data_from_event)
336
+            if data.present?
337
+              handle_event_data(data, event, existing_payload)
327 338
             else
328
-              event.payload['url']
339
+              error "No data was found in the Event payload using the template #{data_from_event}", inbound_event: event
329 340
             end
330
-          check_urls(url_to_scrape,
331
-                    interpolated['mode'].to_s == "merge" ? event.payload : {})
341
+          else
342
+            url_to_scrape =
343
+              if url_template = options['url_from_event'].presence
344
+                interpolate_options(url_template)
345
+              else
346
+                event.payload['url']
347
+              end
348
+            check_urls(url_to_scrape, existing_payload)
349
+          end
332 350
         end
333 351
       end
334 352
     end
335 353
 
336 354
     private
337 355
 
356
+    def handle_event_data(data, event, existing_payload)
357
+      handle_data(data, event.payload['url'], existing_payload)
358
+    rescue => e
359
+      error "Error when handling event data: #{e.message}\n#{e.backtrace.join("\n")}", inbound_event: event
360
+    end
361
+
338 362
     # This method returns true if the result should be stored as a new event.
339 363
     # If mode is set to 'on_change', this method may return false and update an existing
340 364
     # event to expire further in the future.

+ 165 - 71
spec/models/agents/website_agent_spec.rb

@@ -763,92 +763,186 @@ fire: hot
763 763
     end
764 764
 
765 765
     describe "#receive" do
766
-      before do
767
-        @event = Event.new
768
-        @event.agent = agents(:bob_rain_notifier_agent)
769
-        @event.payload = {
770
-          'url' => 'http://xkcd.com',
771
-          'link' => 'Random',
772
-        }
773
-      end
766
+      describe "with a url or url_from_event" do
767
+        before do
768
+          @event = Event.new
769
+          @event.agent = agents(:bob_rain_notifier_agent)
770
+          @event.payload = {
771
+            'url' => 'http://xkcd.com',
772
+            'link' => 'Random',
773
+          }
774
+        end
774 775
 
775
-      it "should scrape from the url element in incoming event payload" do
776
-        expect {
777
-          @checker.options = @valid_options
776
+        it "should scrape from the url element in incoming event payload" do
777
+          expect {
778
+            @checker.options = @valid_options
779
+            @checker.receive([@event])
780
+          }.to change { Event.count }.by(1)
781
+        end
782
+
783
+        it "should use url_from_event as url to scrape if it exists when receiving an event" do
784
+          stub = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
785
+
786
+          @checker.options = @valid_options.merge(
787
+            'url_from_event' => 'http://example.org/?url={{url | uri_escape}}'
788
+          )
778 789
           @checker.receive([@event])
779
-        }.to change { Event.count }.by(1)
780
-      end
781 790
 
782
-      it "should use url_from_event as url to scrape if it exists when receiving an event" do
783
-        stub = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
791
+          expect(stub).to have_been_requested
792
+        end
784 793
 
785
-        @checker.options = @valid_options.merge(
786
-          'url_from_event' => 'http://example.org/?url={{url | uri_escape}}'
787
-        )
788
-        @checker.receive([@event])
794
+        it "should allow url_from_event to be an array of urls" do
795
+          stub1 = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
796
+          stub2 = stub_request(:any, 'http://google.org/?url=http%3A%2F%2Fxkcd.com')
789 797
 
790
-        expect(stub).to have_been_requested
791
-      end
798
+          @checker.options = @valid_options.merge(
799
+            'url_from_event' => ['http://example.org/?url={{url | uri_escape}}', 'http://google.org/?url={{url | uri_escape}}']
800
+          )
801
+          @checker.receive([@event])
792 802
 
793
-      it "should allow url_from_event to be an array of urls" do
794
-        stub1 = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
795
-        stub2 = stub_request(:any, 'http://google.org/?url=http%3A%2F%2Fxkcd.com')
803
+          expect(stub1).to have_been_requested
804
+          expect(stub2).to have_been_requested
805
+        end
796 806
 
797
-        @checker.options = @valid_options.merge(
798
-          'url_from_event' => ['http://example.org/?url={{url | uri_escape}}', 'http://google.org/?url={{url | uri_escape}}']
799
-        )
800
-        @checker.receive([@event])
807
+        it "should interpolate values from incoming event payload" do
808
+          expect {
809
+            @valid_options['extract'] = {
810
+              'from' => {
811
+                'xpath' => '*[1]',
812
+                'value' => '{{url | to_xpath}}'
813
+              },
814
+              'to' => {
815
+                'xpath' => '(//a[@href and text()={{link | to_xpath}}])[1]',
816
+                'value' => '@href'
817
+              },
818
+            }
819
+            @checker.options = @valid_options
820
+            @checker.receive([@event])
821
+          }.to change { Event.count }.by(1)
801 822
 
802
-        expect(stub1).to have_been_requested
803
-        expect(stub2).to have_been_requested
804
-      end
823
+          expect(Event.last.payload).to eq({
824
+            'from' => 'http://xkcd.com',
825
+            'to' => 'http://dynamic.xkcd.com/random/comic/',
826
+          })
827
+        end
805 828
 
806
-      it "should interpolate values from incoming event payload" do
807
-        expect {
808
-          @valid_options['extract'] = {
809
-            'from' => {
810
-              'xpath' => '*[1]',
811
-              'value' => '{{url | to_xpath}}'
812
-            },
813
-            'to' => {
814
-              'xpath' => '(//a[@href and text()={{link | to_xpath}}])[1]',
815
-              'value' => '@href'
816
-            },
817
-          }
818
-          @checker.options = @valid_options
819
-          @checker.receive([@event])
820
-        }.to change { Event.count }.by(1)
829
+        it "should interpolate values from incoming event payload and _response_" do
830
+          @event.payload['title'] = 'XKCD'
821 831
 
822
-        expect(Event.last.payload).to eq({
823
-          'from' => 'http://xkcd.com',
824
-          'to' => 'http://dynamic.xkcd.com/random/comic/',
825
-        })
826
-      end
832
+          expect {
833
+            @valid_options['extract'] = {
834
+              'response_info' => @valid_options['extract']['url'].merge(
835
+                'value' => '{% capture sentence %}The reponse from {{title}} was {{_response_.status}} {{_response_.headers.X-Status-Message}}.{% endcapture %}{{sentence | to_xpath}}'
836
+              )
837
+            }
838
+            @checker.options = @valid_options
839
+            @checker.receive([@event])
840
+          }.to change { Event.count }.by(1)
827 841
 
828
-      it "should interpolate values from incoming event payload and _response_" do
829
-        @event.payload['title'] = 'XKCD'
842
+          expect(Event.last.payload['response_info']).to eq('The reponse from XKCD was 200 OK.')
843
+        end
830 844
 
831
-        expect {
832
-          @valid_options['extract'] = {
833
-            'response_info' => @valid_options['extract']['url'].merge(
834
-              'value' => '{% capture sentence %}The reponse from {{title}} was {{_response_.status}} {{_response_.headers.X-Status-Message}}.{% endcapture %}{{sentence | to_xpath}}'
845
+        it "should support merging of events" do
846
+          expect {
847
+            @checker.options = @valid_options
848
+            @checker.options[:mode] = "merge"
849
+            @checker.receive([@event])
850
+          }.to change { Event.count }.by(1)
851
+          last_payload = Event.last.payload
852
+          expect(last_payload['link']).to eq('Random')
853
+        end
854
+      end
855
+
856
+      describe "with a data_from_event" do
857
+        describe "with json data" do
858
+          before do
859
+            @event = Event.new
860
+            @event.agent = agents(:bob_rain_notifier_agent)
861
+            @event.payload = {
862
+              'something' => 'some value',
863
+              'some_object' => {
864
+                'some_data' => { hello: 'world' }.to_json
865
+              }
866
+            }
867
+            @event.save!
868
+
869
+            @checker.options = @valid_options.merge(
870
+              'type' => 'json',
871
+              'data_from_event' => '{{ some_object.some_data }}',
872
+              'extract' => {
873
+                'value' => { 'path' => 'hello' }
874
+              }
875
+            )
876
+          end
877
+
878
+          it "should extract from the event data in the incoming event payload" do
879
+            expect {
880
+              @checker.receive([@event])
881
+            }.to change { Event.count }.by(1)
882
+            expect(@checker.events.last.payload).to eq({ 'value' => 'world' })
883
+          end
884
+
885
+          it "should support merge mode" do
886
+            @checker.options['mode'] = "merge"
887
+
888
+            expect {
889
+              @checker.receive([@event])
890
+            }.to change { Event.count }.by(1)
891
+            expect(@checker.events.last.payload).to eq(@event.payload.merge('value' => 'world'))
892
+          end
893
+
894
+          it "should output an error when nothing can be found at the path" do
895
+            @checker.options = @checker.options.merge(
896
+              'data_from_event' => '{{ some_object.mistake }}'
835 897
             )
836
-          }
837
-          @checker.options = @valid_options
838
-          @checker.receive([@event])
839
-        }.to change { Event.count }.by(1)
840 898
 
841
-        expect(Event.last.payload['response_info']).to eq('The reponse from XKCD was 200 OK.')
842
-      end
899
+            expect {
900
+              @checker.receive([@event])
901
+            }.to_not change { Event.count }
843 902
 
844
-      it "should support merging of events" do
845
-        expect {
846
-          @checker.options = @valid_options
847
-          @checker.options[:mode] = "merge"
848
-          @checker.receive([@event])
849
-        }.to change { Event.count }.by(1)
850
-        last_payload = Event.last.payload
851
-        expect(last_payload['link']).to eq('Random')
903
+            expect(@checker.logs.last.message).to match(/No data was found in the Event payload using the template {{ some_object\.mistake }}/)
904
+          end
905
+
906
+          it "should output an error when the data cannot be parsed" do
907
+            @event.update_attribute :payload, @event.payload.merge('some_object' => { 'some_data' => '{invalid json' })
908
+
909
+            expect {
910
+              @checker.receive([@event])
911
+            }.to_not change { Event.count }
912
+
913
+            expect(@checker.logs.last.message).to match(/Error when handling event data:/)
914
+          end
915
+        end
916
+
917
+        describe "with HTML data" do
918
+          before do
919
+            @event = Event.new
920
+            @event.agent = agents(:bob_rain_notifier_agent)
921
+            @event.payload = {
922
+              'url' => 'http://xkcd.com',
923
+              'some_object' => {
924
+                'some_data' => "<div><span class='title'>Title!</span><span class='body'>Body!</span></div>"
925
+              }
926
+            }
927
+            @event.save!
928
+
929
+            @checker.options = @valid_options.merge(
930
+              'type' => 'html',
931
+              'data_from_event' => '{{ some_object.some_data }}',
932
+              'extract' => {
933
+                'title' => { 'css' => ".title", 'value' => ".//text()" },
934
+                'body' => { 'css' => "div span.body", 'value' => ".//text()" }
935
+              }
936
+            )
937
+          end
938
+
939
+          it "should extract from the event data in the incoming event payload" do
940
+            expect {
941
+              @checker.receive([@event])
942
+            }.to change { Event.count }.by(1)
943
+            expect(@checker.events.last.payload).to eq({ 'title' => 'Title!', 'body' => 'Body!' })
944
+          end
945
+        end
852 946
       end
853 947
     end
854 948
   end