Merge pull request #412 from knu/website_agent-per_node_xpath

WebsiteAgent: Introduce per-node XPath evaluation in extraction.

Andrew Cantino committed 10 years ago
commit 47eee57a99

+ 21 - 18
app/models/agents/website_agent.rb

@@ -23,14 +23,16 @@ module Agents
 
      To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes.
 
-      When parsing HTML or XML, these sub-hashes specify how to extract with either a `css` CSS selector or a `xpath` XPath expression and either `"text": true` or `attr` pointing to an attribute name to grab.  An example:
+      When parsing HTML or XML, these sub-hashes specify how each extraction should be done.  The Agent first selects a node set from the document for each extraction key by evaluating either a CSS selector in `css` or an XPath expression in `xpath`.  It then evaluates an XPath expression in `value` on each node in the node set, converting the result into a string.  Here's an example:
 
          "extract": {
-            "url": { "css": "#comic img", "attr": "src" },
-            "title": { "css": "#comic img", "attr": "title" },
-            "body_text": { "css": "div.main", "text": true }
+            "url": { "css": "#comic img", "value": "@src" },
+            "title": { "css": "#comic img", "value": "@title" },
+            "body_text": { "css": "div.main", "value": ".//text()" }
          }
 
+      "@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and ".//text()" extracts all the text enclosed in a node.  You can also use [XPath functions](http://www.w3.org/TR/xpath/#section-String-Functions) like `normalize-space` to strip and squeeze whitespace, `substring-after` to extract part of a text, and `translate` to remove commas from a formatted number.  Note that these functions take a string, not a node set, so what you may think would be written as `normalize-space(.//text())` should actually be `normalize-space(.)`.
+
      When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about.  For example:
 
          "extract": {
@@ -70,9 +72,9 @@ module Agents
          'type' => "html",
          'mode' => "on_change",
          'extract' => {
-            'url' => { 'css' => "#comic img", 'attr' => "src" },
-            'title' => { 'css' => "#comic img", 'attr' => "alt" },
-            'hovertext' => { 'css' => "#comic img", 'attr' => "title" }
+            'url' => { 'css' => "#comic img", 'value' => "@src" },
+            'title' => { 'css' => "#comic img", 'value' => "@alt" },
+            'hovertext' => { 'css' => "#comic img", 'value' => "@title" }
          }
      }
    end
@@ -152,20 +154,21 @@ module Agents
                  error '"css" or "xpath" is required for HTML or XML extraction'
                  return
                end
-                unless Nokogiri::XML::NodeSet === nodes
+                case nodes
+                when Nokogiri::XML::NodeSet
+                  result = nodes.map { |node|
+                    case value = node.xpath(extraction_details['value'])
+                    when Float
+                      # Node#xpath() returns any numeric value as float;
+                      # convert it to integer as appropriate.
+                      value = value.to_i if value.to_i == value
+                    end
+                    value.to_s
+                  }
+                else
                  error "The result of HTML/XML extraction was not a NodeSet"
                  return
                end
-                result = nodes.map { |node|
-                  if extraction_details['attr']
-                    node.attr(extraction_details['attr'])
-                  elsif extraction_details['text']
-                    node.text()
-                  else
-                    error '"attr" or "text" is required on HTML or XML extraction patterns'
-                    return
-                  end
-                }
                log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
              end
              output[name] = result
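
For context, here is a minimal standalone sketch (not part of this commit) of what the new per-node evaluation amounts to: select a node set via the `css` option, then evaluate the `value` XPath against each node and stringify the result. The HTML fragment and variable names are made up for illustration.

    require 'nokogiri'

    doc = Nokogiri::HTML(<<-HTML)
      <div id="comic"><img src="/strip.png" title="Hover text"></div>
      <div class="main">A <em>webcomic</em> of romance.</div>
    HTML

    # The `css` option selects the node set; `value` is evaluated per node.
    urls = doc.css('#comic img').map { |node| node.xpath('@src').to_s }
    # => ["/strip.png"]

    # ".//text()" gathers every descendant text node; NodeSet#to_s joins them.
    body = doc.css('div.main').map { |node| node.xpath('.//text()').to_s }
    # => ["A webcomic of romance."]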

+ 30 - 0
db/migrate/20140723110551_adopt_xpath_in_website_agent.rb

@@ -0,0 +1,30 @@
+class AdoptXpathInWebsiteAgent < ActiveRecord::Migration
+  class Agent < ActiveRecord::Base
+    include JSONSerializedField
+    json_serialize :options
+  end
+
+  def up
+    Agent.where(type: 'Agents::WebsiteAgent').each do |agent|
+      extract = agent.options['extract']
+      next unless extract.is_a?(Hash) && extract.all? { |name, detail|
+        detail.key?('xpath') || detail.key?('css')
+      }
+
+      agent.options_will_change!
+      agent.options['extract'].each { |name, extraction|
+        case
+        when extraction.delete('text')
+          extraction['value'] = './/text()'
+        when attr = extraction.delete('attr')
+          extraction['value'] = "@#{attr}"
+        end
+      }
+      agent.save!
+    end
+  end
+
+  def down
+    raise ActiveRecord::IrreversibleMigration, "Cannot revert this migration"
+  end
+end
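
The migration rewrites stored options in place. As a rough illustration, assuming a pre-existing WebsiteAgent whose extraction details still use the old `attr`/`text` keys, the transformation applied to the `extract` hash is equivalent to this standalone sketch (plain hashes here; the real migration iterates over Agent records and calls `save!`):

    extract = {
      'url'   => { 'css' => '#comic img', 'attr' => 'src' },
      'title' => { 'css' => '#comic img', 'text' => true },
    }

    extract.each_value do |extraction|
      case
      when extraction.delete('text')
        extraction['value'] = './/text()'   # "text": true  ->  "value": ".//text()"
      when attr = extraction.delete('attr')
        extraction['value'] = "@#{attr}"    # "attr": "src" ->  "value": "@src"
      end
    end

    p extract
    # => {"url"=>{"css"=>"#comic img", "value"=>"@src"},
    #     "title"=>{"css"=>"#comic img", "value"=>".//text()"}}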

+ 4 - 4
spec/fixtures/agents.yml

@@ -10,8 +10,8 @@ jane_website_agent:
                 :expected_update_period_in_days => 2,
                 :mode => :on_change,
                 :extract => {
-                     :title => {:css => "item title", :text => true},
-                     :url => {:css => "item link", :text => true}
+                     :title => {:css => "item title", :value => './/text()'},
+                     :url => {:css => "item link", :value => './/text()'}
                 }
               }.to_json.inspect %>
 
@@ -27,8 +27,8 @@ bob_website_agent:
                 :expected_update_period_in_days => 2,
                 :mode => :on_change,
                 :extract => {
-                   :url => {:css => "#comic img", :attr => "src"},
-                   :title => {:css => "#comic img", :attr => "title"}
+                   :url => {:css => "#comic img", :value => "@src"},
+                   :title => {:css => "#comic img", :value => "@title"}
                 }
               }.to_json.inspect %>
 

+ 2 - 2
spec/models/agent_spec.rb

@@ -768,8 +768,8 @@ describe AgentDrop do
        url: 'http://dilbert.com/',
        mode: 'on_change',
        extract: {
-          url: { css: '[id^=strip_enlarged_] img', attr: 'src' },
-          title: { css: '.STR_DateStrip', text: true },
+          url: { css: '[id^=strip_enlarged_] img', value: '@src' },
+          title: { css: '.STR_DateStrip', value: './/text()' },
        },
      },
      schedule: 'every_12h',

+ 46 - 9
spec/models/agents/website_agent_spec.rb

@@ -11,9 +11,9 @@ describe Agents::WebsiteAgent do
        'url' => "http://xkcd.com",
        'mode' => 'on_change',
        'extract' => {
-          'url' => { 'css' => "#comic img", 'attr' => "src" },
-          'title' => { 'css' => "#comic img", 'attr' => "alt" },
-          'hovertext' => { 'css' => "#comic img", 'attr' => "title" }
+          'url' => { 'css' => "#comic img", 'value' => "@src" },
+          'title' => { 'css' => "#comic img", 'value' => "@alt" },
+          'hovertext' => { 'css' => "#comic img", 'value' => "@title" }
        }
      }
      @checker = Agents::WebsiteAgent.new(:name => "xkcd", :options => @valid_options, :keep_events_for => 2)
@@ -256,8 +256,7 @@ describe Agents::WebsiteAgent do
          'url' => "http://xkcd.com",
          'mode' => "on_change",
          'extract' => {
-            'url' => {'css' => "#topLeft a", 'attr' => "href"},
-            'title' => {'css' => "#topLeft a", 'text' => "true"}
+            'url' => {'css' => "#topLeft a", 'value' => "@href"},
          }
        }
        rel = Agents::WebsiteAgent.new(:name => "xkcd", :options => rel_site)
@@ -268,6 +267,44 @@ describe Agents::WebsiteAgent do
        event.payload['url'].should == "http://xkcd.com/about"
      end
 
+      it "should return an integer value if XPath evaluates to one" do
+        rel_site = {
+          'name' => "XKCD",
+          'expected_update_period_in_days' => 2,
+          'type' => "html",
+          'url' => "http://xkcd.com",
+          'mode' => "on_change",
+          'extract' => {
+            'num_links' => {'css' => "#comicLinks", 'value' => "count(./a)"}
+          }
+        }
+        rel = Agents::WebsiteAgent.new(:name => "xkcd", :options => rel_site)
+        rel.user = users(:bob)
+        rel.save!
+        rel.check
+        event = Event.last
+        event.payload['num_links'].should == "9"
+      end
+
+      it "should return all texts concatenated if XPath returns many text nodes" do
+        rel_site = {
+          'name' => "XKCD",
+          'expected_update_period_in_days' => 2,
+          'type' => "html",
+          'url' => "http://xkcd.com",
+          'mode' => "on_change",
+          'extract' => {
+            'slogan' => {'css' => "#slogan", 'value' => ".//text()"}
+          }
+        }
+        rel = Agents::WebsiteAgent.new(:name => "xkcd", :options => rel_site)
+        rel.user = users(:bob)
+        rel.save!
+        rel.check
+        event = Event.last
+        event.payload['slogan'].should == "A webcomic of romance, sarcasm, math, and language."
+      end
+
      describe "JSON" do
        it "works with paths" do
          json = {
@@ -389,9 +426,9 @@ describe Agents::WebsiteAgent do
        'url' => "http://www.example.com",
        'mode' => 'on_change',
        'extract' => {
-          'url' => { 'css' => "#comic img", 'attr' => "src" },
-          'title' => { 'css' => "#comic img", 'attr' => "alt" },
-          'hovertext' => { 'css' => "#comic img", 'attr' => "title" }
+          'url' => { 'css' => "#comic img", 'value' => "@src" },
+          'title' => { 'css' => "#comic img", 'value' => "@alt" },
+          'hovertext' => { 'css' => "#comic img", 'value' => "@title" }
        },
        'basic_auth' => "user:pass"
      }
@@ -421,7 +458,7 @@ describe Agents::WebsiteAgent do
        'mode' => 'on_change',
        'headers' => { 'foo' => 'bar' },
        'extract' => {
-          'url' => { 'css' => "#comic img", 'attr' => "src" },
+          'url' => { 'css' => "#comic img", 'value' => "@src" },
        }
      }
      @checker = Agents::WebsiteAgent.new(:name => "ua", :options => @valid_options)
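
The two new specs pin down how non-node-set XPath results come out of the agent. Here is a small sketch of the underlying Nokogiri behaviour that the new Float branch in website_agent.rb handles; the HTML snippet and the link count are made up for illustration:

    require 'nokogiri'

    doc  = Nokogiri::HTML('<div id="comicLinks"><a>1</a><a>2</a><a>3</a></div>')
    node = doc.at_css('#comicLinks')

    value = node.xpath('count(./a)')   # numeric XPath results come back as Float
    value                              # => 3.0

    # The agent downcasts whole numbers before calling #to_s, so the event
    # payload carries "3" rather than "3.0".
    value = value.to_i if value.to_i == value
    value.to_s                         # => "3"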