@@ -33,6 +33,8 @@ module Agents |
||
| 33 | 33 |
|
| 34 | 34 |
"@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and ".//text()" is to extract all the enclosed texts. You can also use [XPath functions](http://www.w3.org/TR/xpath/#section-String-Functions) like `normalize-space` to strip and squeeze whitespace, `substring-after` to extract part of a text, and `translate` to remove commas from a formatted number, etc. Note that these functions take a string, not a node set, so what you may think would be written as `normalize-space(.//text())` should actually be `normalize-space(.)`. |
| 35 | 35 |
|
| 36 |
+ Beware that when parsing an XML document (i.e. `type` is `xml`) using `xpath` expressions, all namespaces are stripped from the document unless a top-level option `use_namespaces` is set to true. |
|
| 37 |
+ |
|
| 36 | 38 |
When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about. For example: |
| 37 | 39 |
|
| 38 | 40 |
"extract": {
|
@@ -299,14 +301,24 @@ module Agents |
||
| 299 | 301 |
end).to_s |
| 300 | 302 |
end |
| 301 | 303 |
|
| 302 |
- def extract_each(doc, &block) |
|
| 304 |
+ def use_namespaces? |
|
| 305 |
+ if value = interpolated.key?('use_namespaces')
|
|
| 306 |
+ boolify(interpolated['use_namespaces']) |
|
| 307 |
+ else |
|
| 308 |
+ interpolated['extract'].none? { |name, extraction_details|
|
|
| 309 |
+ extraction_details.key?('xpath')
|
|
| 310 |
+ } |
|
| 311 |
+ end |
|
| 312 |
+ end |
|
| 313 |
+ |
|
| 314 |
+ def extract_each(&block) |
|
| 303 | 315 |
interpolated['extract'].each_with_object({}) { |(name, extraction_details), output|
|
| 304 | 316 |
output[name] = block.call(extraction_details) |
| 305 | 317 |
} |
| 306 | 318 |
end |
| 307 | 319 |
|
| 308 | 320 |
def extract_json(doc) |
| 309 |
- extract_each(doc) { |extraction_details|
|
|
| 321 |
+ extract_each { |extraction_details|
|
|
| 310 | 322 |
result = Utils.values_at(doc, extraction_details['path']) |
| 311 | 323 |
log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
|
| 312 | 324 |
result |
@@ -314,7 +326,7 @@ module Agents |
||
| 314 | 326 |
end |
| 315 | 327 |
|
| 316 | 328 |
def extract_text(doc) |
| 317 |
- extract_each(doc) { |extraction_details|
|
|
| 329 |
+ extract_each { |extraction_details|
|
|
| 318 | 330 |
regexp = Regexp.new(extraction_details['regexp']) |
| 319 | 331 |
result = [] |
| 320 | 332 |
doc.scan(regexp) {
|
@@ -326,12 +338,11 @@ module Agents |
||
| 326 | 338 |
end |
| 327 | 339 |
|
| 328 | 340 |
def extract_xml(doc) |
| 329 |
- extract_each(doc) { |extraction_details|
|
|
| 341 |
+ extract_each { |extraction_details|
|
|
| 330 | 342 |
case |
| 331 | 343 |
when css = extraction_details['css'] |
| 332 | 344 |
nodes = doc.css(css) |
| 333 | 345 |
when xpath = extraction_details['xpath'] |
| 334 |
- doc.remove_namespaces! # ignore xmlns, useful when parsing atom feeds |
|
| 335 | 346 |
nodes = doc.xpath(xpath) |
| 336 | 347 |
else |
| 337 | 348 |
raise '"css" or "xpath" is required for HTML or XML extraction' |
@@ -356,9 +367,12 @@ module Agents |
||
| 356 | 367 |
end |
| 357 | 368 |
|
| 358 | 369 |
def parse(data) |
| 359 |
- case extraction_type |
|
| 370 |
+ case type = extraction_type |
|
| 360 | 371 |
when "xml" |
| 361 |
- Nokogiri::XML(data) |
|
| 372 |
+ doc = Nokogiri::XML(data) |
|
| 373 |
+ # ignore xmlns, useful when parsing atom feeds |
|
| 374 |
+ doc.remove_namespaces! unless use_namespaces? |
|
| 375 |
+ doc |
|
| 362 | 376 |
when "json" |
| 363 | 377 |
JSON.parse(data) |
| 364 | 378 |
when "html" |
@@ -366,7 +380,7 @@ module Agents |
||
| 366 | 380 |
when "text" |
| 367 | 381 |
data |
| 368 | 382 |
else |
| 369 |
- raise "Unknown extraction type #{extraction_type}"
|
|
| 383 |
+ raise "Unknown extraction type: #{type}"
|
|
| 370 | 384 |
end |
| 371 | 385 |
end |
| 372 | 386 |
|
@@ -368,6 +368,108 @@ describe Agents::WebsiteAgent do |
||
| 368 | 368 |
expect(event.payload['response_info']).to eq('The reponse was 200 OK.')
|
| 369 | 369 |
end |
| 370 | 370 |
|
| 371 |
+ describe "XML" do |
|
| 372 |
+ before do |
|
| 373 |
+ stub_request(:any, /github_rss/).to_return( |
|
| 374 |
+ body: File.read(Rails.root.join("spec/data_fixtures/github_rss.atom")),
|
|
| 375 |
+ status: 200 |
|
| 376 |
+ ) |
|
| 377 |
+ |
|
| 378 |
+ @checker = Agents::WebsiteAgent.new(name: 'github', options: {
|
|
| 379 |
+ 'name' => 'GitHub', |
|
| 380 |
+ 'expected_update_period_in_days' => '2', |
|
| 381 |
+ 'type' => 'xml', |
|
| 382 |
+ 'url' => 'http://example.com/github_rss.atom', |
|
| 383 |
+ 'mode' => 'on_change', |
|
| 384 |
+ 'extract' => {
|
|
| 385 |
+ 'title' => { 'xpath' => '/feed/entry', 'value' => 'normalize-space(./title)' },
|
|
| 386 |
+ 'url' => { 'xpath' => '/feed/entry', 'value' => './link[1]/@href' },
|
|
| 387 |
+ 'thumbnail' => { 'xpath' => '/feed/entry', 'value' => './thumbnail/@url' },
|
|
| 388 |
+ } |
|
| 389 |
+ }, keep_events_for: 2) |
|
| 390 |
+ @checker.user = users(:bob) |
|
| 391 |
+ @checker.save! |
|
| 392 |
+ end |
|
| 393 |
+ |
|
| 394 |
+ it "works with XPath" do |
|
| 395 |
+ expect {
|
|
| 396 |
+ @checker.check |
|
| 397 |
+ }.to change { Event.count }.by(20)
|
|
| 398 |
+ event = Event.last |
|
| 399 |
+ expect(event.payload['title']).to eq('Shift to dev group')
|
|
| 400 |
+ expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
|
|
| 401 |
+ expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
|
|
| 402 |
+ end |
|
| 403 |
+ |
|
| 404 |
+ it "works with XPath with namespaces unstripped" do |
|
| 405 |
+ @checker.options['use_namespaces'] = 'true' |
|
| 406 |
+ @checker.save! |
|
| 407 |
+ expect {
|
|
| 408 |
+ @checker.check |
|
| 409 |
+ }.to change { Event.count }.by(0)
|
|
| 410 |
+ |
|
| 411 |
+ @checker.options['extract'] = {
|
|
| 412 |
+ 'title' => { 'xpath' => '/xmlns:feed/xmlns:entry', 'value' => 'normalize-space(./xmlns:title)' },
|
|
| 413 |
+ 'url' => { 'xpath' => '/xmlns:feed/xmlns:entry', 'value' => './xmlns:link[1]/@href' },
|
|
| 414 |
+ 'thumbnail' => { 'xpath' => '/xmlns:feed/xmlns:entry', 'value' => './media:thumbnail/@url' },
|
|
| 415 |
+ } |
|
| 416 |
+ @checker.save! |
|
| 417 |
+ expect {
|
|
| 418 |
+ @checker.check |
|
| 419 |
+ }.to change { Event.count }.by(20)
|
|
| 420 |
+ event = Event.last |
|
| 421 |
+ expect(event.payload['title']).to eq('Shift to dev group')
|
|
| 422 |
+ expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
|
|
| 423 |
+ expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
|
|
| 424 |
+ end |
|
| 425 |
+ |
|
| 426 |
+ it "works with CSS selectors" do |
|
| 427 |
+ @checker.options['extract'] = {
|
|
| 428 |
+ 'title' => { 'css' => 'feed > entry', 'value' => 'normalize-space(./title)' },
|
|
| 429 |
+ 'url' => { 'css' => 'feed > entry', 'value' => './link[1]/@href' },
|
|
| 430 |
+ 'thumbnail' => { 'css' => 'feed > entry', 'value' => './thumbnail/@url' },
|
|
| 431 |
+ } |
|
| 432 |
+ @checker.save! |
|
| 433 |
+ expect {
|
|
| 434 |
+ @checker.check |
|
| 435 |
+ }.to change { Event.count }.by(20)
|
|
| 436 |
+ event = Event.last |
|
| 437 |
+ expect(event.payload['title']).to be_empty |
|
| 438 |
+ expect(event.payload['thumbnail']).to be_empty |
|
| 439 |
+ |
|
| 440 |
+ @checker.options['extract'] = {
|
|
| 441 |
+ 'title' => { 'css' => 'feed > entry', 'value' => 'normalize-space(./xmlns:title)' },
|
|
| 442 |
+ 'url' => { 'css' => 'feed > entry', 'value' => './xmlns:link[1]/@href' },
|
|
| 443 |
+ 'thumbnail' => { 'css' => 'feed > entry', 'value' => './media:thumbnail/@url' },
|
|
| 444 |
+ } |
|
| 445 |
+ @checker.save! |
|
| 446 |
+ expect {
|
|
| 447 |
+ @checker.check |
|
| 448 |
+ }.to change { Event.count }.by(20)
|
|
| 449 |
+ event = Event.last |
|
| 450 |
+ expect(event.payload['title']).to eq('Shift to dev group')
|
|
| 451 |
+ expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
|
|
| 452 |
+ expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
|
|
| 453 |
+ end |
|
| 454 |
+ |
|
| 455 |
+ it "works with CSS selectors with namespaces stripped" do |
|
| 456 |
+ @checker.options['extract'] = {
|
|
| 457 |
+ 'title' => { 'css' => 'feed > entry', 'value' => 'normalize-space(./title)' },
|
|
| 458 |
+ 'url' => { 'css' => 'feed > entry', 'value' => './link[1]/@href' },
|
|
| 459 |
+ 'thumbnail' => { 'css' => 'feed > entry', 'value' => './thumbnail/@url' },
|
|
| 460 |
+ } |
|
| 461 |
+ @checker.options['use_namespaces'] = 'false' |
|
| 462 |
+ @checker.save! |
|
| 463 |
+ expect {
|
|
| 464 |
+ @checker.check |
|
| 465 |
+ }.to change { Event.count }.by(20)
|
|
| 466 |
+ event = Event.last |
|
| 467 |
+ expect(event.payload['title']).to eq('Shift to dev group')
|
|
| 468 |
+ expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
|
|
| 469 |
+ expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
|
|
| 470 |
+ end |
|
| 471 |
+ end |
|
| 472 |
+ |
|
| 371 | 473 |
describe "JSON" do |
| 372 | 474 |
it "works with paths" do |
| 373 | 475 |
json = {
|