@@ -299,6 +299,12 @@ module Agents |
||
299 | 299 |
end).to_s |
300 | 300 |
end |
301 | 301 |
|
302 |
+ def use_namespaces? |
|
303 |
+ interpolated['extract'].none? { |name, extraction_details| |
|
304 |
+ extraction_details.key?('xpath') |
|
305 |
+ } |
|
306 |
+ end |
|
307 |
+ |
|
302 | 308 |
def extract_each(&block) |
303 | 309 |
interpolated['extract'].each_with_object({}) { |(name, extraction_details), output| |
304 | 310 |
output[name] = block.call(extraction_details) |
@@ -331,7 +337,6 @@ module Agents |
||
331 | 337 |
when css = extraction_details['css'] |
332 | 338 |
nodes = doc.css(css) |
333 | 339 |
when xpath = extraction_details['xpath'] |
334 |
- doc.remove_namespaces! # ignore xmlns, useful when parsing atom feeds |
|
335 | 340 |
nodes = doc.xpath(xpath) |
336 | 341 |
else |
337 | 342 |
raise '"css" or "xpath" is required for HTML or XML extraction' |
@@ -356,9 +361,12 @@ module Agents |
||
356 | 361 |
end |
357 | 362 |
|
358 | 363 |
def parse(data) |
359 |
- case extraction_type |
|
364 |
+ case type = extraction_type |
|
360 | 365 |
when "xml" |
361 |
- Nokogiri::XML(data) |
|
366 |
+ doc = Nokogiri::XML(data) |
|
367 |
+ # ignore xmlns, useful when parsing atom feeds |
|
368 |
+ doc.remove_namespaces! unless use_namespaces? |
|
369 |
+ doc |
|
362 | 370 |
when "json" |
363 | 371 |
JSON.parse(data) |
364 | 372 |
when "html" |
@@ -366,7 +374,7 @@ module Agents |
||
366 | 374 |
when "text" |
367 | 375 |
data |
368 | 376 |
else |
369 |
- raise "Unknown extraction type #{extraction_type}" |
|
377 |
+ raise "Unknown extraction type: #{type}" |
|
370 | 378 |
end |
371 | 379 |
end |
372 | 380 |
|