Merge pull request #412 from knu/website_agent-per_node_xpath

WebsiteAgent: Introduce per-node XPath evaluation in extraction.

Andrew Cantino committed 10 years ago
commit 47eee57a99

+ 21 - 18
app/models/agents/website_agent.rb

@@ -23,14 +23,16 @@ module Agents
 
      To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes.
 
-      When parsing HTML or XML, these sub-hashes specify how to extract with either a `css` CSS selector or a `xpath` XPath expression and either `"text": true` or `attr` pointing to an attribute name to grab.  An example:
+      When parsing HTML or XML, these sub-hashes specify how each extraction should be done.  The Agent first selects a node set from the document for each extraction key by evaluating either a CSS selector in `css` or an XPath expression in `xpath`.  It then evaluates an XPath expression in `value` on each node in the node set, converting the result into a string.  Here's an example:
 
          "extract": {
-            "url": { "css": "#comic img", "attr": "src" },
-            "title": { "css": "#comic img", "attr": "title" },
-            "body_text": { "css": "div.main", "text": true }
+            "url": { "css": "#comic img", "value": "@src" },
+            "title": { "css": "#comic img", "value": "@title" },
+            "body_text": { "css": "div.main", "value": ".//text()" }
          }
 
+      "@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and ".//text()" extracts all the text enclosed in a node.  You can also use [XPath functions](http://www.w3.org/TR/xpath/#section-String-Functions) like `normalize-space` to strip and squeeze whitespace, `substring-after` to extract part of a text, and `translate` to remove commas from a formatted number.  Note that these functions take a string, not a node set, so what you may think would be written as `normalize-space(.//text())` should actually be `normalize-space(.)`.
+
      When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about.  For example:
 
          "extract": {
@@ -70,9 +72,9 @@ module Agents
          'type' => "html",
          'mode' => "on_change",
          'extract' => {
-            'url' => { 'css' => "#comic img", 'attr' => "src" },
-            'title' => { 'css' => "#comic img", 'attr' => "alt" },
-            'hovertext' => { 'css' => "#comic img", 'attr' => "title" }
+            'url' => { 'css' => "#comic img", 'value' => "@src" },
+            'title' => { 'css' => "#comic img", 'value' => "@alt" },
+            'hovertext' => { 'css' => "#comic img", 'value' => "@title" }
          }
      }
    end
@@ -152,20 +154,21 @@ module Agents
                  error '"css" or "xpath" is required for HTML or XML extraction'
                  return
                end
-                unless Nokogiri::XML::NodeSet === nodes
+                case nodes
+                when Nokogiri::XML::NodeSet
+                  result = nodes.map { |node|
+                    case value = node.xpath(extraction_details['value'])
+                    when Float
+                      # Node#xpath() returns any numeric value as float;
+                      # convert it to integer as appropriate.
+                      value = value.to_i if value.to_i == value
+                    end
+                    value.to_s
+                  }
+                else
                  error "The result of HTML/XML extraction was not a NodeSet"
                  return
                end
-                result = nodes.map { |node|
-                  if extraction_details['attr']
-                    node.attr(extraction_details['attr'])
-                  elsif extraction_details['text']
-                    node.text()
-                  else
-                    error '"attr" or "text" is required on HTML or XML extraction patterns'
-                    return
-                  end
-                }
                log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
              end
              output[name] = result
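
For context, here is a minimal standalone sketch (not part of this commit) of what the new per-node evaluation amounts to: select a node set via the `css` option, then evaluate the `value` XPath against each node and stringify the result. The HTML fragment and variable names are made up for illustration.

    require 'nokogiri'

    doc = Nokogiri::HTML(<<-HTML)
      <div id="comic"><img src="/strip.png" title="Hover text"></div>
      <div class="main">A <em>webcomic</em> of romance.</div>
    HTML

    # The `css` option selects the node set; `value` is evaluated per node.
    urls = doc.css('#comic img').map { |node| node.xpath('@src').to_s }
    # => ["/strip.png"]

    # ".//text()" gathers every descendant text node; NodeSet#to_s joins them.
    body = doc.css('div.main').map { |node| node.xpath('.//text()').to_s }
    # => ["A webcomic of romance."]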

+ 30 - 0
db/migrate/20140723110551_adopt_xpath_in_website_agent.rb

@@ -0,0 +1,30 @@
+class AdoptXpathInWebsiteAgent < ActiveRecord::Migration
+  class Agent < ActiveRecord::Base
+    include JSONSerializedField
+    json_serialize :options
+  end
+
+  def up
+    Agent.where(type: 'Agents::WebsiteAgent').each do |agent|
+      extract = agent.options['extract']
+      next unless extract.is_a?(Hash) && extract.all? { |name, detail|
+        detail.key?('xpath') || detail.key?('css')
+      }
+
+      agent.options_will_change!
+      agent.options['extract'].each { |name, extraction|
+        case
+        when extraction.delete('text')
+          extraction['value'] = './/text()'
+        when attr = extraction.delete('attr')
+          extraction['value'] = "@#{attr}"
+        end
+      }
+      agent.save!
+    end
+  end
+
+  def down
+    raise ActiveRecord::IrreversibleMigration, "Cannot revert this migration"
+  end
+end
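
The migration rewrites stored options in place. As a rough illustration, assuming a pre-existing WebsiteAgent whose extraction details still use the old `attr`/`text` keys, the transformation applied to the `extract` hash is equivalent to this standalone sketch (plain hashes here; the real migration iterates over Agent records and calls `save!`):

    extract = {
      'url'   => { 'css' => '#comic img', 'attr' => 'src' },
      'title' => { 'css' => '#comic img', 'text' => true },
    }

    extract.each_value do |extraction|
      case
      when extraction.delete('text')
        extraction['value'] = './/text()'   # "text": true  ->  "value": ".//text()"
      when attr = extraction.delete('attr')
        extraction['value'] = "@#{attr}"    # "attr": "src" ->  "value": "@src"
      end
    end

    p extract
    # => {"url"=>{"css"=>"#comic img", "value"=>"@src"},
    #     "title"=>{"css"=>"#comic img", "value"=>".//text()"}}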

+ 4 - 4
spec/fixtures/agents.yml

@@ -10,8 +10,8 @@ jane_website_agent:
                 :expected_update_period_in_days => 2,
                 :mode => :on_change,
                 :extract => {
-                     :title => {:css => "item title", :text => true},
-                     :url => {:css => "item link", :text => true}
+                     :title => {:css => "item title", :value => './/text()'},
+                     :url => {:css => "item link", :value => './/text()'}
                 }
               }.to_json.inspect %>
 
@@ -27,8 +27,8 @@ bob_website_agent:
                 :expected_update_period_in_days => 2,
                 :mode => :on_change,
                 :extract => {
-                   :url => {:css => "#comic img", :attr => "src"},
-                   :title => {:css => "#comic img", :attr => "title"}
+                   :url => {:css => "#comic img", :value => "@src"},
+                   :title => {:css => "#comic img", :value => "@title"}
                 }
               }.to_json.inspect %>
 

+ 2 - 2
spec/models/agent_spec.rb

@@ -768,8 +768,8 @@ describe AgentDrop do
        url: 'http://dilbert.com/',
        mode: 'on_change',
        extract: {
-          url: { css: '[id^=strip_enlarged_] img', attr: 'src' },
-          title: { css: '.STR_DateStrip', text: true },
+          url: { css: '[id^=strip_enlarged_] img', value: '@src' },
+          title: { css: '.STR_DateStrip', value: './/text()' },
        },
      },
      schedule: 'every_12h',

+ 46 - 9
spec/models/agents/website_agent_spec.rb

@@ -11,9 +11,9 @@ describe Agents::WebsiteAgent do
        'url' => "http://xkcd.com",
        'mode' => 'on_change',
        'extract' => {
-          'url' => { 'css' => "#comic img", 'attr' => "src" },
-          'title' => { 'css' => "#comic img", 'attr' => "alt" },
-          'hovertext' => { 'css' => "#comic img", 'attr' => "title" }
+          'url' => { 'css' => "#comic img", 'value' => "@src" },
+          'title' => { 'css' => "#comic img", 'value' => "@alt" },
+          'hovertext' => { 'css' => "#comic img", 'value' => "@title" }
        }
      }
      @checker = Agents::WebsiteAgent.new(:name => "xkcd", :options => @valid_options, :keep_events_for => 2)
@@ -256,8 +256,7 @@ describe Agents::WebsiteAgent do
          'url' => "http://xkcd.com",
          'mode' => "on_change",
          'extract' => {
-            'url' => {'css' => "#topLeft a", 'attr' => "href"},
-            'title' => {'css' => "#topLeft a", 'text' => "true"}
+            'url' => {'css' => "#topLeft a", 'value' => "@href"},
          }
        }
        rel = Agents::WebsiteAgent.new(:name => "xkcd", :options => rel_site)
@@ -268,6 +267,44 @@ describe Agents::WebsiteAgent do
        event.payload['url'].should == "http://xkcd.com/about"
      end
 
+      it "should return an integer value if XPath evaluates to one" do
+        rel_site = {
+          'name' => "XKCD",
+          'expected_update_period_in_days' => 2,
+          'type' => "html",
+          'url' => "http://xkcd.com",
+          'mode' => "on_change",
+          'extract' => {
+            'num_links' => {'css' => "#comicLinks", 'value' => "count(./a)"}
+          }
+        }
+        rel = Agents::WebsiteAgent.new(:name => "xkcd", :options => rel_site)
+        rel.user = users(:bob)
+        rel.save!
+        rel.check
+        event = Event.last
+        event.payload['num_links'].should == "9"
+      end
+
+      it "should return all texts concatenated if XPath returns many text nodes" do
+        rel_site = {
+          'name' => "XKCD",
+          'expected_update_period_in_days' => 2,
+          'type' => "html",
+          'url' => "http://xkcd.com",
+          'mode' => "on_change",
+          'extract' => {
+            'slogan' => {'css' => "#slogan", 'value' => ".//text()"}
+          }
+        }
+        rel = Agents::WebsiteAgent.new(:name => "xkcd", :options => rel_site)
+        rel.user = users(:bob)
+        rel.save!
+        rel.check
+        event = Event.last
+        event.payload['slogan'].should == "A webcomic of romance, sarcasm, math, and language."
+      end
+
      describe "JSON" do
        it "works with paths" do
          json = {
@@ -389,9 +426,9 @@ describe Agents::WebsiteAgent do
        'url' => "http://www.example.com",
        'mode' => 'on_change',
        'extract' => {
-          'url' => { 'css' => "#comic img", 'attr' => "src" },
-          'title' => { 'css' => "#comic img", 'attr' => "alt" },
-          'hovertext' => { 'css' => "#comic img", 'attr' => "title" }
+          'url' => { 'css' => "#comic img", 'value' => "@src" },
+          'title' => { 'css' => "#comic img", 'value' => "@alt" },
+          'hovertext' => { 'css' => "#comic img", 'value' => "@title" }
        },
        'basic_auth' => "user:pass"
      }
@@ -421,7 +458,7 @@ describe Agents::WebsiteAgent do
        'mode' => 'on_change',
        'headers' => { 'foo' => 'bar' },
        'extract' => {
-          'url' => { 'css' => "#comic img", 'attr' => "src" },
+          'url' => { 'css' => "#comic img", 'value' => "@src" },
        }
      }
      @checker = Agents::WebsiteAgent.new(:name => "ua", :options => @valid_options)
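
The two new specs pin down how non-node-set XPath results come out of the agent. Here is a small sketch of the underlying Nokogiri behaviour that the new Float branch in website_agent.rb handles; the HTML snippet and the link count are made up for illustration:

    require 'nokogiri'

    doc  = Nokogiri::HTML('<div id="comicLinks"><a>1</a><a>2</a><a>3</a></div>')
    node = doc.at_css('#comicLinks')

    value = node.xpath('count(./a)')   # numeric XPath results come back as Float
    value                              # => 3.0

    # The agent downcasts whole numbers before calling #to_s, so the event
    # payload carries "3" rather than "3.0".
    value = value.to_i if value.to_i == value
    value.to_s                         # => "3"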