Merge branch 'website_agent-use_namespaces'

This is a part of #619.

Akinori MUSHA 9 years ago
parent
commit
ffe9b38580
2 changed files with 124 additions and 8 deletions
  1. 22 8
      app/models/agents/website_agent.rb
  2. 102 0
      spec/models/agents/website_agent_spec.rb

+ 22 - 8
app/models/agents/website_agent.rb

@@ -33,6 +33,8 @@ module Agents
33 33
 
34 34
       "@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and ".//text()" is to extract all the enclosed texts.  You can also use [XPath functions](http://www.w3.org/TR/xpath/#section-String-Functions) like `normalize-space` to strip and squeeze whitespace, `substring-after` to extract part of a text, and `translate` to remove comma from a formatted number, etc.  Note that these functions take a string, not a node set, so what you may think would be written as `normalize-space(.//text())` should actually be `normalize-space(.)`.
35 35
 
36
+      Beware that when parsing an XML document (i.e. `type` is `xml`) using `xpath` expressions, all namespaces are stripped from the document unless a top-level option `use_namespaces` is set to true.
37
+
36 38
       When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about.  For example:
37 39
 
38 40
           "extract": {
@@ -299,14 +301,24 @@ module Agents
299 301
       end).to_s
300 302
     end
301 303
 
302
-    def extract_each(doc, &block)
304
+    def use_namespaces?
305
+      if value = interpolated.key?('use_namespaces')
306
+        boolify(interpolated['use_namespaces'])
307
+      else
308
+        interpolated['extract'].none? { |name, extraction_details|
309
+          extraction_details.key?('xpath')
310
+        }
311
+      end
312
+    end
313
+
314
+    def extract_each(&block)
303 315
       interpolated['extract'].each_with_object({}) { |(name, extraction_details), output|
304 316
         output[name] = block.call(extraction_details)
305 317
       }
306 318
     end
307 319
 
308 320
     def extract_json(doc)
309
-      extract_each(doc) { |extraction_details|
321
+      extract_each { |extraction_details|
310 322
         result = Utils.values_at(doc, extraction_details['path'])
311 323
         log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
312 324
         result
@@ -314,7 +326,7 @@ module Agents
314 326
     end
315 327
 
316 328
     def extract_text(doc)
317
-      extract_each(doc) { |extraction_details|
329
+      extract_each { |extraction_details|
318 330
         regexp = Regexp.new(extraction_details['regexp'])
319 331
         result = []
320 332
         doc.scan(regexp) {
@@ -326,12 +338,11 @@ module Agents
326 338
     end
327 339
 
328 340
     def extract_xml(doc)
329
-      extract_each(doc) { |extraction_details|
341
+      extract_each { |extraction_details|
330 342
         case
331 343
         when css = extraction_details['css']
332 344
           nodes = doc.css(css)
333 345
         when xpath = extraction_details['xpath']
334
-          doc.remove_namespaces! # ignore xmlns, useful when parsing atom feeds
335 346
           nodes = doc.xpath(xpath)
336 347
         else
337 348
           raise '"css" or "xpath" is required for HTML or XML extraction'
@@ -356,9 +367,12 @@ module Agents
356 367
     end
357 368
 
358 369
     def parse(data)
359
-      case extraction_type
370
+      case type = extraction_type
360 371
       when "xml"
361
-        Nokogiri::XML(data)
372
+        doc = Nokogiri::XML(data)
373
+        # ignore xmlns, useful when parsing atom feeds
374
+        doc.remove_namespaces! unless use_namespaces?
375
+        doc
362 376
       when "json"
363 377
         JSON.parse(data)
364 378
       when "html"
@@ -366,7 +380,7 @@ module Agents
366 380
       when "text"
367 381
         data
368 382
       else
369
-        raise "Unknown extraction type #{extraction_type}"
383
+        raise "Unknown extraction type: #{type}"
370 384
       end
371 385
     end
372 386
 

+ 102 - 0
spec/models/agents/website_agent_spec.rb

@@ -368,6 +368,108 @@ describe Agents::WebsiteAgent do
368 368
         expect(event.payload['response_info']).to eq('The reponse was 200 OK.')
369 369
       end
370 370
 
371
+      describe "XML" do
372
+        before do
373
+          stub_request(:any, /github_rss/).to_return(
374
+            body: File.read(Rails.root.join("spec/data_fixtures/github_rss.atom")),
375
+            status: 200
376
+          )
377
+
378
+          @checker = Agents::WebsiteAgent.new(name: 'github', options: {
379
+            'name' => 'GitHub',
380
+            'expected_update_period_in_days' => '2',
381
+            'type' => 'xml',
382
+            'url' => 'http://example.com/github_rss.atom',
383
+            'mode' => 'on_change',
384
+            'extract' => {
385
+              'title' => { 'xpath' => '/feed/entry', 'value' => 'normalize-space(./title)' },
386
+              'url' => { 'xpath' => '/feed/entry', 'value' => './link[1]/@href' },
387
+              'thumbnail' => { 'xpath' => '/feed/entry', 'value' => './thumbnail/@url' },
388
+            }
389
+          }, keep_events_for: 2)
390
+          @checker.user = users(:bob)
391
+          @checker.save!
392
+        end
393
+
394
+        it "works with XPath" do
395
+          expect {
396
+            @checker.check
397
+          }.to change { Event.count }.by(20)
398
+          event = Event.last
399
+          expect(event.payload['title']).to eq('Shift to dev group')
400
+          expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
401
+          expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
402
+        end
403
+
404
+        it "works with XPath with namespaces unstripped" do
405
+          @checker.options['use_namespaces'] = 'true'
406
+          @checker.save!
407
+          expect {
408
+            @checker.check
409
+          }.to change { Event.count }.by(0)
410
+
411
+          @checker.options['extract'] = {
412
+            'title' => { 'xpath' => '/xmlns:feed/xmlns:entry', 'value' => 'normalize-space(./xmlns:title)' },
413
+            'url' => { 'xpath' => '/xmlns:feed/xmlns:entry', 'value' => './xmlns:link[1]/@href' },
414
+            'thumbnail' => { 'xpath' => '/xmlns:feed/xmlns:entry', 'value' => './media:thumbnail/@url' },
415
+          }
416
+          @checker.save!
417
+          expect {
418
+            @checker.check
419
+          }.to change { Event.count }.by(20)
420
+          event = Event.last
421
+          expect(event.payload['title']).to eq('Shift to dev group')
422
+          expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
423
+          expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
424
+        end
425
+
426
+        it "works with CSS selectors" do
427
+          @checker.options['extract'] = {
428
+            'title' => { 'css' => 'feed > entry', 'value' => 'normalize-space(./title)' },
429
+            'url' => { 'css' => 'feed > entry', 'value' => './link[1]/@href' },
430
+            'thumbnail' => { 'css' => 'feed > entry', 'value' => './thumbnail/@url' },
431
+          }
432
+          @checker.save!
433
+          expect {
434
+            @checker.check
435
+          }.to change { Event.count }.by(20)
436
+          event = Event.last
437
+          expect(event.payload['title']).to be_empty
438
+          expect(event.payload['thumbnail']).to be_empty
439
+
440
+          @checker.options['extract'] = {
441
+            'title' => { 'css' => 'feed > entry', 'value' => 'normalize-space(./xmlns:title)' },
442
+            'url' => { 'css' => 'feed > entry', 'value' => './xmlns:link[1]/@href' },
443
+            'thumbnail' => { 'css' => 'feed > entry', 'value' => './media:thumbnail/@url' },
444
+          }
445
+          @checker.save!
446
+          expect {
447
+            @checker.check
448
+          }.to change { Event.count }.by(20)
449
+          event = Event.last
450
+          expect(event.payload['title']).to eq('Shift to dev group')
451
+          expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
452
+          expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
453
+        end
454
+
455
+        it "works with CSS selectors with namespaces stripped" do
456
+          @checker.options['extract'] = {
457
+            'title' => { 'css' => 'feed > entry', 'value' => 'normalize-space(./title)' },
458
+            'url' => { 'css' => 'feed > entry', 'value' => './link[1]/@href' },
459
+            'thumbnail' => { 'css' => 'feed > entry', 'value' => './thumbnail/@url' },
460
+          }
461
+          @checker.options['use_namespaces'] = 'false'
462
+          @checker.save!
463
+          expect {
464
+            @checker.check
465
+          }.to change { Event.count }.by(20)
466
+          event = Event.last
467
+          expect(event.payload['title']).to eq('Shift to dev group')
468
+          expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
469
+          expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
470
+        end
471
+      end
472
+
371 473
       describe "JSON" do
372 474
         it "works with paths" do
373 475
           json = {