@@ -16,6 +16,8 @@ module Agents
 
       Specify a `url` and select a `mode` for when to create Events based on the scraped data, either `all` or `on_change`.
 
+      `url` can be a single URL or an array of URLs (for example, multiple pages with the same structure but different content to scrape).
+
       The `type` value can be `xml`, `html`, or `json`.
 
       To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes.
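For illustration, the new array form amounts to an options hash like the following (a minimal sketch; the urls, selector, and extraction name are placeholders, not part of this change):

```ruby
# Hypothetical WebsiteAgent options exercising an array of urls.
# Every value below is illustrative only.
{
  'url'     => ['http://example.com/page/1', 'http://example.com/page/2'],
  'type'    => 'html',
  'mode'    => 'on_change',
  'extract' => {
    'title' => { 'css' => 'h1.title', 'text' => true }
  }
}
```

Each url in the array is fetched with the same request options, and each response flows through the existing extraction and de-duplication logic unchanged.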
@@ -107,85 +109,97 @@ module Agents
       log "Fetching #{options['url']}"
       request_opts = { :followlocation => true }
       request_opts[:userpwd] = options['basic_auth'] if options['basic_auth'].present?
-      request = Typhoeus::Request.new(options['url'], request_opts)
 
-      request.on_failure do |response|
-        error "Failed: #{response.inspect}"
+      requests = []
+
+      if options['url'].kind_of?(Array)
+        options['url'].each do |url|
+          requests.push(Typhoeus::Request.new(url, request_opts))
+        end
+      else
+        requests.push(Typhoeus::Request.new(options['url'], request_opts))
       end
 
-      request.on_success do |response|
-        body = response.body
-        if (encoding = options['force_encoding']).present?
-          body = body.encode(Encoding::UTF_8, encoding)
+      requests.each do |request|
+        request.on_failure do |response|
+          error "Failed: #{response.inspect}"
         end
-        doc = parse(body)
 
-        if extract_full_json?
-          if store_payload!(previous_payloads(1), doc)
-            log "Storing new result for '#{name}': #{doc.inspect}"
-            create_event :payload => doc
+        request.on_success do |response|
+          body = response.body
+          if (encoding = options['force_encoding']).present?
+            body = body.encode(Encoding::UTF_8, encoding)
           end
-        else
-          output = {}
-          options['extract'].each do |name, extraction_details|
-            if extraction_type == "json"
-              result = Utils.values_at(doc, extraction_details['path'])
-              log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
-            else
-              case
-              when css = extraction_details['css']
-                nodes = doc.css(css)
-              when xpath = extraction_details['xpath']
-                nodes = doc.xpath(xpath)
+          doc = parse(body)
+
+          if extract_full_json?
+            if store_payload!(previous_payloads(1), doc)
+              log "Storing new result for '#{name}': #{doc.inspect}"
+              create_event :payload => doc
+            end
+          else
+            output = {}
+            options['extract'].each do |name, extraction_details|
+              if extraction_type == "json"
+                result = Utils.values_at(doc, extraction_details['path'])
+                log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
               else
-                error "'css' or 'xpath' is required for HTML or XML extraction"
-                return
-              end
-              unless Nokogiri::XML::NodeSet === nodes
-                error "The result of HTML/XML extraction was not a NodeSet"
-                return
-              end
-              result = nodes.map { |node|
-                if extraction_details['attr']
-                  node.attr(extraction_details['attr'])
-                elsif extraction_details['text']
-                  node.text()
+                case
+                when css = extraction_details['css']
+                  nodes = doc.css(css)
+                when xpath = extraction_details['xpath']
+                  nodes = doc.xpath(xpath)
                 else
-                  error "'attr' or 'text' is required on HTML or XML extraction patterns"
+                  error "'css' or 'xpath' is required for HTML or XML extraction"
                   return
                 end
-              }
-              log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
+                unless Nokogiri::XML::NodeSet === nodes
+                  error "The result of HTML/XML extraction was not a NodeSet"
+                  return
+                end
+                result = nodes.map { |node|
+                  if extraction_details['attr']
+                    node.attr(extraction_details['attr'])
+                  elsif extraction_details['text']
+                    node.text()
+                  else
+                    error "'attr' or 'text' is required on HTML or XML extraction patterns"
+                    return
+                  end
+                }
+                log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
+              end
+              output[name] = result
             end
-            output[name] = result
-          end
 
-          num_unique_lengths = options['extract'].keys.map { |name| output[name].length }.uniq
+            num_unique_lengths = options['extract'].keys.map { |name| output[name].length }.uniq
 
-          if num_unique_lengths.length != 1
-            error "Got an uneven number of matches for #{options['name']}: #{options['extract'].inspect}"
-            return
-          end
-
-          old_events = previous_payloads num_unique_lengths.first
-          num_unique_lengths.first.times do |index|
-            result = {}
-            options['extract'].keys.each do |name|
-              result[name] = output[name][index]
-              if name.to_s == 'url'
-                result[name] = URI.join(options['url'], result[name]).to_s if (result[name] =~ URI::DEFAULT_PARSER.regexp[:ABS_URI]).nil?
-              end
+            if num_unique_lengths.length != 1
+              error "Got an uneven number of matches for #{options['name']}: #{options['extract'].inspect}"
+              return
             end
+
+            old_events = previous_payloads num_unique_lengths.first
+            num_unique_lengths.first.times do |index|
+              result = {}
+              options['extract'].keys.each do |name|
+                result[name] = output[name][index]
+                if name.to_s == 'url'
+                  result[name] = URI.join(options['url'], result[name]).to_s if (result[name] =~ URI::DEFAULT_PARSER.regexp[:ABS_URI]).nil?
+                end
+              end
 
-            if store_payload!(old_events, result)
-              log "Storing new parsed result for '#{name}': #{result.inspect}"
-              create_event :payload => result
+              if store_payload!(old_events, result)
+                log "Storing new parsed result for '#{name}': #{result.inspect}"
+                create_event :payload => result
+              end
             end
           end
         end
+
+        hydra.queue request
+        hydra.run
       end
-      hydra.queue request
-      hydra.run
     end
 
     private
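For context, the fan-out pattern the new `check` body relies on, reduced to a standalone sketch (assumes only the typhoeus gem; the urls and handlers are illustrative):

```ruby
require 'typhoeus'

# One Typhoeus::Request per url, mirroring the requests array built above.
urls = ['http://example.com/a', 'http://example.com/b']
request_opts = { :followlocation => true }
requests = urls.map { |url| Typhoeus::Request.new(url, request_opts) }

hydra = Typhoeus::Hydra.new
requests.each do |request|
  request.on_failure { |response| warn "Failed: #{response.code}" }
  request.on_success { |response| puts "#{response.effective_url}: #{response.body.bytesize} bytes" }
  hydra.queue request
end
hydra.run # blocks until every queued request has completed
```

One design note: the sketch queues every request before a single `hydra.run`, so the fetches run concurrently, whereas the diff calls `hydra.queue`/`hydra.run` inside the `requests.each` loop, which effectively fetches the urls one at a time.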
@@ -91,6 +91,30 @@ describe Agents::WebsiteAgent do
       @checker.check
       @checker.logs.first.message.should =~ /Got an uneven number of matches/
     end
+
+    it "should accept an array for url" do
+      @site['url'] = ["http://xkcd.com/1/", "http://xkcd.com/2/"]
+      @checker.options = @site
+      lambda { @checker.save! }.should_not raise_error
+      lambda { @checker.check }.should_not raise_error
+    end
+
+    it "should parse events from all urls in array" do
+      lambda {
+        @site['url'] = ["http://xkcd.com/", "http://xkcd.com/"]
+        @site['mode'] = 'all'
+        @checker.options = @site
+        @checker.check
+      }.should change { Event.count }.by(2)
+    end
+
+    it "should follow uniqueness rules when parsing an array of urls" do
+      lambda {
+        @site['url'] = ["http://xkcd.com/", "http://xkcd.com/"]
+        @checker.options = @site
+        @checker.check
+      }.should change { Event.count }.by(1)
+    end
   end
 
   describe 'encoding' do
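The last two examples pin down how the array option interacts with de-duplication: the same url fetched twice yields two events in `all` mode, but only one when the `store_payload!` check in the agent filters repeats. To run just this file locally, something like `bundle exec rspec spec/models/agents/website_agent_spec.rb` should work, assuming Huginn's usual spec layout.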