@@ -18,14 +18,13 @@ module Agents |
||
18 | 18 |
|
19 | 19 |
Specify a `url` and select a `mode` for when to create Events based on the scraped data, either `all`, `on_change`, or `merge` (if fetching based on an Event, see below). |
20 | 20 |
|
21 |
- `url` can be a single url, or an array of urls (for example, for multiple pages with the exact same structure but different content to scrape) |
|
21 |
+ The `url` option can be a single url, or an array of urls (for example, for multiple pages with the exact same structure but different content to scrape). |
|
22 | 22 |
|
23 | 23 |
The WebsiteAgent can also scrape based on incoming events. |
24 | 24 |
|
25 |
- * If the Event contains a `url` key, that URL will be fetched. |
|
26 |
- * For more control, you can set the `url_from_event` option and it will be used as a Liquid template to generate the url to access based on the Event. |
|
27 |
- * If you set `data_from_event` to a Liquid template, it will be used to generate the data directly without fetching any URL. (For example, set it to `{{ html }}` to use HTML contained in the `html` key of the incoming Event.) |
|
28 |
- * If you specify `merge` for the `mode` option, Huginn will retain the old payload and update it with the new values. |
|
25 |
+ * Set the `url_from_event` option to a Liquid template to generate the url to access based on the Event. (To fetch the url in the Event's `url` key, for example, set `url_from_event` to `{{ url }}`.) |
|
26 |
+ * Alternatively, set `data_from_event` to a Liquid template to use data directly without fetching any URL. (For example, set it to `{{ html }}` to use HTML contained in the `html` key of the incoming Event.) |
|
27 |
+ * If you specify `merge` for the `mode` option, Huginn will retain the old payload and update it with new values. |
|
29 | 28 |
|
30 | 29 |
# Supported Document Types |
31 | 30 |
|
@@ -343,7 +342,7 @@ module Agents |
||
343 | 342 |
if url_template = options['url_from_event'].presence |
344 | 343 |
interpolate_options(url_template) |
345 | 344 |
else |
346 |
- event.payload['url'].presence || interpolated['url'] |
|
345 |
+ interpolated['url'] |
|
347 | 346 |
end |
348 | 347 |
check_urls(url_to_scrape, existing_payload) |
349 | 348 |
end |
@@ -0,0 +1,22 @@ |
||
1 |
+class WebsiteAgentDoesNotUseEventUrl < ActiveRecord::Migration |
|
2 |
+ def up |
|
3 |
+ # Until this migration, if a WebsiteAgent received Events and did not have a `url_from_event` option set, |
|
4 |
+ # it would fetch the `url` from the Event's payload, falling back to its own `url` option when the Event |
|
5 |
+ # payload had no `url`. This migration assumes that if someone has wired a WebsiteAgent to receive Events |
|
6 |
+ # and has not set `url_from_event` or `data_from_event`, they were trying to use the Event's `url` payload, so we |
|
7 |
+ # set `url_from_event` to `{{ url }}` for them. NOTE(review): presumably `{{ url }}` renders empty when an Event has no `url`, so the old fallback to the Agent's own `url` is not preserved for migrated agents - confirm. |
|
8 |
+ Agents::WebsiteAgent.find_each do |agent| |
|
9 |
+ next unless agent.sources.count > 0 |
|
10 |
+ |
|
11 |
+ if !agent.options['data_from_event'].present? && !agent.options['url_from_event'].present? |
|
12 |
+ agent.options['url_from_event'] = '{{ url }}' |
|
13 |
+ agent.save! |
|
14 |
+ puts ">> Setting `url_from_event` on WebsiteAgent##{agent.id} to {{ url }} because it is wired" |
|
15 |
+ puts ">> to receive Events, and the WebsiteAgent no longer uses the Event's `url` value directly." |
|
16 |
+ end |
|
17 |
+ end |
|
18 |
+ end |
|
19 |
+ |
|
20 |
+ def down |
|
21 |
+ end |
|
22 |
+end |
@@ -768,20 +768,13 @@ fire: hot |
||
768 | 768 |
@event = Event.new |
769 | 769 |
@event.agent = agents(:bob_rain_notifier_agent) |
770 | 770 |
@event.payload = { |
771 |
- 'url' => 'http://xkcd.com', |
|
771 |
+ 'url' => 'http://foo.com', |
|
772 | 772 |
'link' => 'Random' |
773 | 773 |
} |
774 | 774 |
end |
775 | 775 |
|
776 |
- it "should scrape from the url element in incoming event payload" do |
|
777 |
- expect { |
|
778 |
- @checker.options = @valid_options |
|
779 |
- @checker.receive([@event]) |
|
780 |
- }.to change { Event.count }.by(1) |
|
781 |
- end |
|
782 |
- |
|
783 |
- it "should use url_from_event as url to scrape if it exists when receiving an event" do |
|
784 |
- stub = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com') |
|
776 |
+ it "should use url_from_event as the url to scrape" do |
|
777 |
+ stub = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Ffoo.com') |
|
785 | 778 |
|
786 | 779 |
@checker.options = @valid_options.merge( |
787 | 780 |
'url_from_event' => 'http://example.org/?url={{url | uri_escape}}' |
@@ -791,9 +784,16 @@ fire: hot |
||
791 | 784 |
expect(stub).to have_been_requested |
792 | 785 |
end |
793 | 786 |
|
787 |
+ it "should use the Agent's `url` option if url_from_event is not set" do |
|
788 |
+ expect { |
|
789 |
+ @checker.options = @valid_options |
|
790 |
+ @checker.receive([@event]) |
|
791 |
+ }.to change { Event.count }.by(1) |
|
792 |
+ end |
|
793 |
+ |
|
794 | 794 |
it "should allow url_from_event to be an array of urls" do |
795 |
- stub1 = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com') |
|
796 |
- stub2 = stub_request(:any, 'http://google.org/?url=http%3A%2F%2Fxkcd.com') |
|
795 |
+ stub1 = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Ffoo.com') |
|
796 |
+ stub2 = stub_request(:any, 'http://google.org/?url=http%3A%2F%2Ffoo.com') |
|
797 | 797 |
|
798 | 798 |
@checker.options = @valid_options.merge( |
799 | 799 |
'url_from_event' => ['http://example.org/?url={{url | uri_escape}}', 'http://google.org/?url={{url | uri_escape}}'] |
@@ -805,7 +805,10 @@ fire: hot |
||
805 | 805 |
end |
806 | 806 |
|
807 | 807 |
it "should interpolate values from incoming event payload" do |
808 |
+ stub_request(:any, /foo/).to_return(body: File.read(Rails.root.join("spec/data_fixtures/xkcd.html")), status: 200) |
|
809 |
+ |
|
808 | 810 |
expect { |
811 |
+ @valid_options['url_from_event'] = '{{ url }}' |
|
809 | 812 |
@valid_options['extract'] = { |
810 | 813 |
'from' => { |
811 | 814 |
'xpath' => '*[1]', |
@@ -821,7 +824,7 @@ fire: hot |
||
821 | 824 |
}.to change { Event.count }.by(1) |
822 | 825 |
|
823 | 826 |
expect(Event.last.payload).to eq({ |
824 |
- 'from' => 'http://xkcd.com', |
|
827 |
+ 'from' => 'http://foo.com', |
|
825 | 828 |
'to' => 'http://dynamic.xkcd.com/random/comic/', |
826 | 829 |
}) |
827 | 830 |
end |
@@ -1075,7 +1078,6 @@ fire: hot |
||
1075 | 1078 |
event = @events[6] |
1076 | 1079 |
expect(event.payload['url']).to eq("https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8") |
1077 | 1080 |
end |
1078 |
- |
|
1079 | 1081 |
end |
1080 | 1082 |
end |
1081 | 1083 |
end |