@@ -33,6 +33,8 @@ module Agents |
||
| 33 | 33 |
|
| 34 | 34 |
"@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and ".//text()" is to extract all the enclosed texts. You can also use [XPath functions](http://www.w3.org/TR/xpath/#section-String-Functions) like `normalize-space` to strip and squeeze whitespace, `substring-after` to extract part of a text, and `translate` to remove commas from a formatted number, etc. Note that these functions take a string, not a node set, so what you may think would be written as `normalize-space(.//text())` should actually be `normalize-space(.)`. |
| 35 | 35 |
|
| 36 |
+ Beware that when parsing an XML document (i.e. `type` is `xml`) using `xpath` expressions, all namespaces are stripped from the document unless a top-level option `use_namespaces` is set to true. |
|
| 37 |
+ |
|
| 36 | 38 |
When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about. For example: |
| 37 | 39 |
|
| 38 | 40 |
"extract": {
|
@@ -299,14 +301,24 @@ module Agents |
||
| 299 | 301 |
end).to_s |
| 300 | 302 |
end |
| 301 | 303 |
|
| 302 |
- def extract_each(doc, &block) |
|
| 304 |
+ def use_namespaces? |
|
| 305 |
+ if value = interpolated.key?('use_namespaces')
|
|
| 306 |
+ boolify(interpolated['use_namespaces']) |
|
| 307 |
+ else |
|
| 308 |
+ interpolated['extract'].none? { |name, extraction_details|
|
|
| 309 |
+ extraction_details.key?('xpath')
|
|
| 310 |
+ } |
|
| 311 |
+ end |
|
| 312 |
+ end |
|
| 313 |
+ |
|
| 314 |
+ def extract_each(&block) |
|
| 303 | 315 |
interpolated['extract'].each_with_object({}) { |(name, extraction_details), output|
|
| 304 | 316 |
output[name] = block.call(extraction_details) |
| 305 | 317 |
} |
| 306 | 318 |
end |
| 307 | 319 |
|
| 308 | 320 |
def extract_json(doc) |
| 309 |
- extract_each(doc) { |extraction_details|
|
|
| 321 |
+ extract_each { |extraction_details|
|
|
| 310 | 322 |
result = Utils.values_at(doc, extraction_details['path']) |
| 311 | 323 |
log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
|
| 312 | 324 |
result |
@@ -314,7 +326,7 @@ module Agents |
||
| 314 | 326 |
end |
| 315 | 327 |
|
| 316 | 328 |
def extract_text(doc) |
| 317 |
- extract_each(doc) { |extraction_details|
|
|
| 329 |
+ extract_each { |extraction_details|
|
|
| 318 | 330 |
regexp = Regexp.new(extraction_details['regexp']) |
| 319 | 331 |
result = [] |
| 320 | 332 |
doc.scan(regexp) {
|
@@ -326,12 +338,11 @@ module Agents |
||
| 326 | 338 |
end |
| 327 | 339 |
|
| 328 | 340 |
def extract_xml(doc) |
| 329 |
- extract_each(doc) { |extraction_details|
|
|
| 341 |
+ extract_each { |extraction_details|
|
|
| 330 | 342 |
case |
| 331 | 343 |
when css = extraction_details['css'] |
| 332 | 344 |
nodes = doc.css(css) |
| 333 | 345 |
when xpath = extraction_details['xpath'] |
| 334 |
- doc.remove_namespaces! # ignore xmlns, useful when parsing atom feeds |
|
| 335 | 346 |
nodes = doc.xpath(xpath) |
| 336 | 347 |
else |
| 337 | 348 |
raise '"css" or "xpath" is required for HTML or XML extraction' |
@@ -356,9 +367,12 @@ module Agents |
||
| 356 | 367 |
end |
| 357 | 368 |
|
| 358 | 369 |
def parse(data) |
| 359 |
- case extraction_type |
|
| 370 |
+ case type = extraction_type |
|
| 360 | 371 |
when "xml" |
| 361 |
- Nokogiri::XML(data) |
|
| 372 |
+ doc = Nokogiri::XML(data) |
|
| 373 |
+ # ignore xmlns, useful when parsing atom feeds |
|
| 374 |
+ doc.remove_namespaces! unless use_namespaces? |
|
| 375 |
+ doc |
|
| 362 | 376 |
when "json" |
| 363 | 377 |
JSON.parse(data) |
| 364 | 378 |
when "html" |
@@ -366,7 +380,7 @@ module Agents |
||
| 366 | 380 |
when "text" |
| 367 | 381 |
data |
| 368 | 382 |
else |
| 369 |
- raise "Unknown extraction type #{extraction_type}"
|
|
| 383 |
+ raise "Unknown extraction type: #{type}"
|
|
| 370 | 384 |
end |
| 371 | 385 |
end |
| 372 | 386 |
|
@@ -368,6 +368,108 @@ describe Agents::WebsiteAgent do |
||
| 368 | 368 |
expect(event.payload['response_info']).to eq('The reponse was 200 OK.')
|
| 369 | 369 |
end |
| 370 | 370 |
|
| 371 |
+ describe "XML" do |
|
| 372 |
+ before do |
|
| 373 |
+ stub_request(:any, /github_rss/).to_return( |
|
| 374 |
+ body: File.read(Rails.root.join("spec/data_fixtures/github_rss.atom")),
|
|
| 375 |
+ status: 200 |
|
| 376 |
+ ) |
|
| 377 |
+ |
|
| 378 |
+ @checker = Agents::WebsiteAgent.new(name: 'github', options: {
|
|
| 379 |
+ 'name' => 'GitHub', |
|
| 380 |
+ 'expected_update_period_in_days' => '2', |
|
| 381 |
+ 'type' => 'xml', |
|
| 382 |
+ 'url' => 'http://example.com/github_rss.atom', |
|
| 383 |
+ 'mode' => 'on_change', |
|
| 384 |
+ 'extract' => {
|
|
| 385 |
+ 'title' => { 'xpath' => '/feed/entry', 'value' => 'normalize-space(./title)' },
|
|
| 386 |
+ 'url' => { 'xpath' => '/feed/entry', 'value' => './link[1]/@href' },
|
|
| 387 |
+ 'thumbnail' => { 'xpath' => '/feed/entry', 'value' => './thumbnail/@url' },
|
|
| 388 |
+ } |
|
| 389 |
+ }, keep_events_for: 2) |
|
| 390 |
+ @checker.user = users(:bob) |
|
| 391 |
+ @checker.save! |
|
| 392 |
+ end |
|
| 393 |
+ |
|
| 394 |
+ it "works with XPath" do |
|
| 395 |
+ expect {
|
|
| 396 |
+ @checker.check |
|
| 397 |
+ }.to change { Event.count }.by(20)
|
|
| 398 |
+ event = Event.last |
|
| 399 |
+ expect(event.payload['title']).to eq('Shift to dev group')
|
|
| 400 |
+ expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
|
|
| 401 |
+ expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
|
|
| 402 |
+ end |
|
| 403 |
+ |
|
| 404 |
+ it "works with XPath with namespaces unstripped" do |
|
| 405 |
+ @checker.options['use_namespaces'] = 'true' |
|
| 406 |
+ @checker.save! |
|
| 407 |
+ expect {
|
|
| 408 |
+ @checker.check |
|
| 409 |
+ }.to change { Event.count }.by(0)
|
|
| 410 |
+ |
|
| 411 |
+ @checker.options['extract'] = {
|
|
| 412 |
+ 'title' => { 'xpath' => '/xmlns:feed/xmlns:entry', 'value' => 'normalize-space(./xmlns:title)' },
|
|
| 413 |
+ 'url' => { 'xpath' => '/xmlns:feed/xmlns:entry', 'value' => './xmlns:link[1]/@href' },
|
|
| 414 |
+ 'thumbnail' => { 'xpath' => '/xmlns:feed/xmlns:entry', 'value' => './media:thumbnail/@url' },
|
|
| 415 |
+ } |
|
| 416 |
+ @checker.save! |
|
| 417 |
+ expect {
|
|
| 418 |
+ @checker.check |
|
| 419 |
+ }.to change { Event.count }.by(20)
|
|
| 420 |
+ event = Event.last |
|
| 421 |
+ expect(event.payload['title']).to eq('Shift to dev group')
|
|
| 422 |
+ expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
|
|
| 423 |
+ expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
|
|
| 424 |
+ end |
|
| 425 |
+ |
|
| 426 |
+ it "works with CSS selectors" do |
|
| 427 |
+ @checker.options['extract'] = {
|
|
| 428 |
+ 'title' => { 'css' => 'feed > entry', 'value' => 'normalize-space(./title)' },
|
|
| 429 |
+ 'url' => { 'css' => 'feed > entry', 'value' => './link[1]/@href' },
|
|
| 430 |
+ 'thumbnail' => { 'css' => 'feed > entry', 'value' => './thumbnail/@url' },
|
|
| 431 |
+ } |
|
| 432 |
+ @checker.save! |
|
| 433 |
+ expect {
|
|
| 434 |
+ @checker.check |
|
| 435 |
+ }.to change { Event.count }.by(20)
|
|
| 436 |
+ event = Event.last |
|
| 437 |
+ expect(event.payload['title']).to be_empty |
|
| 438 |
+ expect(event.payload['thumbnail']).to be_empty |
|
| 439 |
+ |
|
| 440 |
+ @checker.options['extract'] = {
|
|
| 441 |
+ 'title' => { 'css' => 'feed > entry', 'value' => 'normalize-space(./xmlns:title)' },
|
|
| 442 |
+ 'url' => { 'css' => 'feed > entry', 'value' => './xmlns:link[1]/@href' },
|
|
| 443 |
+ 'thumbnail' => { 'css' => 'feed > entry', 'value' => './media:thumbnail/@url' },
|
|
| 444 |
+ } |
|
| 445 |
+ @checker.save! |
|
| 446 |
+ expect {
|
|
| 447 |
+ @checker.check |
|
| 448 |
+ }.to change { Event.count }.by(20)
|
|
| 449 |
+ event = Event.last |
|
| 450 |
+ expect(event.payload['title']).to eq('Shift to dev group')
|
|
| 451 |
+ expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
|
|
| 452 |
+ expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
|
|
| 453 |
+ end |
|
| 454 |
+ |
|
| 455 |
+ it "works with CSS selectors with namespaces stripped" do |
|
| 456 |
+ @checker.options['extract'] = {
|
|
| 457 |
+ 'title' => { 'css' => 'feed > entry', 'value' => 'normalize-space(./title)' },
|
|
| 458 |
+ 'url' => { 'css' => 'feed > entry', 'value' => './link[1]/@href' },
|
|
| 459 |
+ 'thumbnail' => { 'css' => 'feed > entry', 'value' => './thumbnail/@url' },
|
|
| 460 |
+ } |
|
| 461 |
+ @checker.options['use_namespaces'] = 'false' |
|
| 462 |
+ @checker.save! |
|
| 463 |
+ expect {
|
|
| 464 |
+ @checker.check |
|
| 465 |
+ }.to change { Event.count }.by(20)
|
|
| 466 |
+ event = Event.last |
|
| 467 |
+ expect(event.payload['title']).to eq('Shift to dev group')
|
|
| 468 |
+ expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
|
|
| 469 |
+ expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
|
|
| 470 |
+ end |
|
| 471 |
+ end |
|
| 472 |
+ |
|
| 371 | 473 |
describe "JSON" do |
| 372 | 474 |
it "works with paths" do |
| 373 | 475 |
json = {
|