Merge pull request #213 from knu/website_agent-xpath

Add :xpath support to WebsiteAgent.

Andrew Cantino 10 gadi atpakaļ
vecāks
revīzija
136cbf5c56
2 mainīti faili ar 46 papildinājumiem un 18 dzēšanām
  1. 32 17
      app/models/agents/website_agent.rb
  2. 14 1
      spec/models/agents/website_agent_spec.rb

+ 32 - 17
app/models/agents/website_agent.rb

@@ -20,7 +20,7 @@ module Agents
20 20
 
21 21
       To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes.
22 22
 
23
-      When parsing HTML or XML, these sub-hashes specify how to extract with a `css` CSS selector and either `'text': true` or `attr` pointing to an attribute name to grab.  An example:
23
+      When parsing HTML or XML, these sub-hashes specify how to extract with either a `css` CSS selector or a `xpath` XPath expression and either `'text': true` or `attr` pointing to an attribute name to grab.  An example:
24 24
 
25 25
           'extract': {
26 26
             'url': { 'css': "#comic img", 'attr': "src" },
@@ -109,21 +109,36 @@ module Agents
109 109
         else
110 110
           output = {}
111 111
           options['extract'].each do |name, extraction_details|
112
-            result = if extraction_type == "json"
113
-                       output[name] = Utils.values_at(doc, extraction_details['path'])
114
-                     else
115
-                       output[name] = doc.css(extraction_details['css']).map { |node|
116
-                         if extraction_details['attr']
117
-                           node.attr(extraction_details['attr'])
118
-                         elsif extraction_details['text']
119
-                           node.text()
120
-                         else
121
-                           error "'attr' or 'text' is required on HTML or XML extraction patterns"
122
-                           return
123
-                         end
124
-                       }
125
-                     end
126
-            log "Extracting #{extraction_type} at #{extraction_details['path'] || extraction_details['css']}: #{result}"
112
+            if extraction_type == "json"
113
+              result = Utils.values_at(doc, extraction_details['path'])
114
+              log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
115
+            else
116
+              case
117
+              when css = extraction_details['css']
118
+                nodes = doc.css(css)
119
+              when xpath = extraction_details['xpath']
120
+                nodes = doc.xpath(xpath)
121
+              else
122
+                error "'css' or 'xpath' is required for HTML or XML extraction"
123
+                return
124
+              end
125
+              unless Nokogiri::XML::NodeSet === nodes
126
+                error "The result of HTML/XML extraction was not a NodeSet"
127
+                return
128
+              end
129
+              result = nodes.map { |node|
130
+                if extraction_details['attr']
131
+                  node.attr(extraction_details['attr'])
132
+                elsif extraction_details['text']
133
+                  node.text()
134
+                else
135
+                  error "'attr' or 'text' is required on HTML or XML extraction patterns"
136
+                  return
137
+                end
138
+              }
139
+              log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
140
+            end
141
+            output[name] = result
127 142
           end
128 143
 
129 144
           num_unique_lengths = options['extract'].keys.map { |name| output[name].length }.uniq
@@ -228,4 +243,4 @@ module Agents
228 243
       end
229 244
     end
230 245
   end
231
-end
246
+end

+ 14 - 1
spec/models/agents/website_agent_spec.rb

@@ -114,6 +114,19 @@ describe Agents::WebsiteAgent do
114 114
         event.payload['hovertext'].should =~ /^Biologists play reverse/
115 115
       end
116 116
 
117
+      it "parses XPath" do
118
+        @site['extract'].each { |key, value|
119
+          value.delete('css')
120
+          value['xpath'] = "//*[@id='comic']//img"
121
+        }
122
+        @checker.options = @site
123
+        @checker.check
124
+        event = Event.last
125
+        event.payload['url'].should == "http://imgs.xkcd.com/comics/evolving.png"
126
+        event.payload['title'].should == "Evolving"
127
+        event.payload['hovertext'].should =~ /^Biologists play reverse/
128
+      end
129
+
117 130
       it "should turn relative urls to absolute" do
118 131
         rel_site = {
119 132
           'name' => "XKCD",
@@ -258,4 +271,4 @@ describe Agents::WebsiteAgent do
258 271
       end
259 272
     end
260 273
   end
261
-end
274
+end