@@ -23,14 +23,16 @@ module Agents |
||
23 | 23 |
|
24 | 24 |
To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes. |
25 | 25 |
|
26 |
- When parsing HTML or XML, these sub-hashes specify how to extract with either a `css` CSS selector or a `xpath` XPath expression and either `"text": true` or `attr` pointing to an attribute name to grab. An example: |
|
26 |
+ When parsing HTML or XML, these sub-hashes specify how each extraction should be done. The Agent first selects a node set from the document for each extraction key by evaluating either a CSS selector in `css` or an XPath expression in `xpath`. It then evaluates an XPath expression in `value` on each node in the node set, converting the result into a string. Here's an example: |
|
27 | 27 |
|
28 | 28 |
"extract": { |
29 |
- "url": { "css": "#comic img", "attr": "src" }, |
|
30 |
- "title": { "css": "#comic img", "attr": "title" }, |
|
31 |
- "body_text": { "css": "div.main", "text": true } |
|
29 |
+ "url": { "css": "#comic img", "value": "@src" }, |
|
30 |
+ "title": { "css": "#comic img", "value": "@title" }, |
|
31 |
+ "body_text": { "css": "div.main", "value": ".//text()" } |
|
32 | 32 |
} |
33 | 33 |
|
34 |
+ "@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and ".//text()" extracts all of the enclosed text. You can also use [XPath functions](http://www.w3.org/TR/xpath/#section-String-Functions) like `normalize-space` to strip and squeeze whitespace, `substring-after` to extract part of a text, and `translate` to remove commas from a formatted number, etc. Note that these functions take a string, not a node set, so what you may think would be written as `normalize-space(.//text())` should actually be `normalize-space(.)`. |
|
35 |
+ |
|
34 | 36 |
When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about. For example: |
35 | 37 |
|
36 | 38 |
"extract": { |
@@ -70,9 +72,9 @@ module Agents |
||
70 | 72 |
'type' => "html", |
71 | 73 |
'mode' => "on_change", |
72 | 74 |
'extract' => { |
73 |
- 'url' => { 'css' => "#comic img", 'attr' => "src" }, |
|
74 |
- 'title' => { 'css' => "#comic img", 'attr' => "alt" }, |
|
75 |
- 'hovertext' => { 'css' => "#comic img", 'attr' => "title" } |
|
75 |
+ 'url' => { 'css' => "#comic img", 'value' => "@src" }, |
|
76 |
+ 'title' => { 'css' => "#comic img", 'value' => "@alt" }, |
|
77 |
+ 'hovertext' => { 'css' => "#comic img", 'value' => "@title" } |
|
76 | 78 |
} |
77 | 79 |
} |
78 | 80 |
end |
@@ -152,20 +154,21 @@ module Agents |
||
152 | 154 |
error '"css" or "xpath" is required for HTML or XML extraction' |
153 | 155 |
return |
154 | 156 |
end |
155 |
- unless Nokogiri::XML::NodeSet === nodes |
|
157 |
+ case nodes |
|
158 |
+ when Nokogiri::XML::NodeSet |
|
159 |
+ result = nodes.map { |node| |
|
160 |
+ case value = node.xpath(extraction_details['value']) |
|
161 |
+ when Float |
|
162 |
+ # Node#xpath() returns any numeric value as float; |
|
163 |
+ # convert it to integer as appropriate. |
|
164 |
+ value = value.to_i if value.to_i == value |
|
165 |
+ end |
|
166 |
+ value.to_s |
|
167 |
+ } |
|
168 |
+ else |
|
156 | 169 |
error "The result of HTML/XML extraction was not a NodeSet" |
157 | 170 |
return |
158 | 171 |
end |
159 |
- result = nodes.map { |node| |
|
160 |
- if extraction_details['attr'] |
|
161 |
- node.attr(extraction_details['attr']) |
|
162 |
- elsif extraction_details['text'] |
|
163 |
- node.text() |
|
164 |
- else |
|
165 |
- error '"attr" or "text" is required on HTML or XML extraction patterns' |
|
166 |
- return |
|
167 |
- end |
|
168 |
- } |
|
169 | 172 |
log "Extracting #{extraction_type} at #{xpath || css}: #{result}" |
170 | 173 |
end |
171 | 174 |
output[name] = result |
@@ -0,0 +1,30 @@ |
||
1 |
+class AdoptXpathInWebsiteAgent < ActiveRecord::Migration |
|
2 |
+ class Agent < ActiveRecord::Base |
|
3 |
+ include JSONSerializedField |
|
4 |
+ json_serialize :options |
|
5 |
+ end |
|
6 |
+ |
|
7 |
+ def up |
|
8 |
+ Agent.where(type: 'Agents::WebsiteAgent').each do |agent| |
|
9 |
+ extract = agent.options['extract'] |
|
10 |
+ next unless extract.is_a?(Hash) && extract.all? { |name, detail| |
|
11 |
+ detail.key?('xpath') || detail.key?('css') |
|
12 |
+ } |
|
13 |
+ |
|
14 |
+ agent.options_will_change! |
|
15 |
+ agent.options['extract'].each { |name, extraction| |
|
16 |
+ case |
|
17 |
+ when extraction.delete('text') |
|
18 |
+ extraction['value'] = './/text()' |
|
19 |
+ when attr = extraction.delete('attr') |
|
20 |
+ extraction['value'] = "@#{attr}" |
|
21 |
+ end |
|
22 |
+ } |
|
23 |
+ agent.save! |
|
24 |
+ end |
|
25 |
+ end |
|
26 |
+ |
|
27 |
+ def down |
|
28 |
+ raise ActiveRecord::IrreversibleMigration, "Cannot revert this migration" |
|
29 |
+ end |
|
30 |
+end |
@@ -10,8 +10,8 @@ jane_website_agent: |
||
10 | 10 |
:expected_update_period_in_days => 2, |
11 | 11 |
:mode => :on_change, |
12 | 12 |
:extract => { |
13 |
- :title => {:css => "item title", :text => true}, |
|
14 |
- :url => {:css => "item link", :text => true} |
|
13 |
+ :title => {:css => "item title", :value => './/text()'}, |
|
14 |
+ :url => {:css => "item link", :value => './/text()'} |
|
15 | 15 |
} |
16 | 16 |
}.to_json.inspect %> |
17 | 17 |
|
@@ -27,8 +27,8 @@ bob_website_agent: |
||
27 | 27 |
:expected_update_period_in_days => 2, |
28 | 28 |
:mode => :on_change, |
29 | 29 |
:extract => { |
30 |
- :url => {:css => "#comic img", :attr => "src"}, |
|
31 |
- :title => {:css => "#comic img", :attr => "title"} |
|
30 |
+ :url => {:css => "#comic img", :value => "@src"}, |
|
31 |
+ :title => {:css => "#comic img", :value => "@title"} |
|
32 | 32 |
} |
33 | 33 |
}.to_json.inspect %> |
34 | 34 |
|
@@ -768,8 +768,8 @@ describe AgentDrop do |
||
768 | 768 |
url: 'http://dilbert.com/', |
769 | 769 |
mode: 'on_change', |
770 | 770 |
extract: { |
771 |
- url: { css: '[id^=strip_enlarged_] img', attr: 'src' }, |
|
772 |
- title: { css: '.STR_DateStrip', text: true }, |
|
771 |
+ url: { css: '[id^=strip_enlarged_] img', value: '@src' }, |
|
772 |
+ title: { css: '.STR_DateStrip', value: './/text()' }, |
|
773 | 773 |
}, |
774 | 774 |
}, |
775 | 775 |
schedule: 'every_12h', |
@@ -11,9 +11,9 @@ describe Agents::WebsiteAgent do |
||
11 | 11 |
'url' => "http://xkcd.com", |
12 | 12 |
'mode' => 'on_change', |
13 | 13 |
'extract' => { |
14 |
- 'url' => { 'css' => "#comic img", 'attr' => "src" }, |
|
15 |
- 'title' => { 'css' => "#comic img", 'attr' => "alt" }, |
|
16 |
- 'hovertext' => { 'css' => "#comic img", 'attr' => "title" } |
|
14 |
+ 'url' => { 'css' => "#comic img", 'value' => "@src" }, |
|
15 |
+ 'title' => { 'css' => "#comic img", 'value' => "@alt" }, |
|
16 |
+ 'hovertext' => { 'css' => "#comic img", 'value' => "@title" } |
|
17 | 17 |
} |
18 | 18 |
} |
19 | 19 |
@checker = Agents::WebsiteAgent.new(:name => "xkcd", :options => @valid_options, :keep_events_for => 2) |
@@ -256,8 +256,7 @@ describe Agents::WebsiteAgent do |
||
256 | 256 |
'url' => "http://xkcd.com", |
257 | 257 |
'mode' => "on_change", |
258 | 258 |
'extract' => { |
259 |
- 'url' => {'css' => "#topLeft a", 'attr' => "href"}, |
|
260 |
- 'title' => {'css' => "#topLeft a", 'text' => "true"} |
|
259 |
+ 'url' => {'css' => "#topLeft a", 'value' => "@href"}, |
|
261 | 260 |
} |
262 | 261 |
} |
263 | 262 |
rel = Agents::WebsiteAgent.new(:name => "xkcd", :options => rel_site) |
@@ -268,6 +267,44 @@ describe Agents::WebsiteAgent do |
||
268 | 267 |
event.payload['url'].should == "http://xkcd.com/about" |
269 | 268 |
end |
270 | 269 |
|
270 |
+ it "should return an integer value if XPath evaluates to one" do |
|
271 |
+ rel_site = { |
|
272 |
+ 'name' => "XKCD", |
|
273 |
+ 'expected_update_period_in_days' => 2, |
|
274 |
+ 'type' => "html", |
|
275 |
+ 'url' => "http://xkcd.com", |
|
276 |
+ 'mode' => "on_change", |
|
277 |
+ 'extract' => { |
|
278 |
+ 'num_links' => {'css' => "#comicLinks", 'value' => "count(./a)"} |
|
279 |
+ } |
|
280 |
+ } |
|
281 |
+ rel = Agents::WebsiteAgent.new(:name => "xkcd", :options => rel_site) |
|
282 |
+ rel.user = users(:bob) |
|
283 |
+ rel.save! |
|
284 |
+ rel.check |
|
285 |
+ event = Event.last |
|
286 |
+ event.payload['num_links'].should == "9" |
|
287 |
+ end |
|
288 |
+ |
|
289 |
+ it "should return all texts concatenated if XPath returns many text nodes" do |
|
290 |
+ rel_site = { |
|
291 |
+ 'name' => "XKCD", |
|
292 |
+ 'expected_update_period_in_days' => 2, |
|
293 |
+ 'type' => "html", |
|
294 |
+ 'url' => "http://xkcd.com", |
|
295 |
+ 'mode' => "on_change", |
|
296 |
+ 'extract' => { |
|
297 |
+ 'slogan' => {'css' => "#slogan", 'value' => ".//text()"} |
|
298 |
+ } |
|
299 |
+ } |
|
300 |
+ rel = Agents::WebsiteAgent.new(:name => "xkcd", :options => rel_site) |
|
301 |
+ rel.user = users(:bob) |
|
302 |
+ rel.save! |
|
303 |
+ rel.check |
|
304 |
+ event = Event.last |
|
305 |
+ event.payload['slogan'].should == "A webcomic of romance, sarcasm, math, and language." |
|
306 |
+ end |
|
307 |
+ |
|
271 | 308 |
describe "JSON" do |
272 | 309 |
it "works with paths" do |
273 | 310 |
json = { |
@@ -389,9 +426,9 @@ describe Agents::WebsiteAgent do |
||
389 | 426 |
'url' => "http://www.example.com", |
390 | 427 |
'mode' => 'on_change', |
391 | 428 |
'extract' => { |
392 |
- 'url' => { 'css' => "#comic img", 'attr' => "src" }, |
|
393 |
- 'title' => { 'css' => "#comic img", 'attr' => "alt" }, |
|
394 |
- 'hovertext' => { 'css' => "#comic img", 'attr' => "title" } |
|
429 |
+ 'url' => { 'css' => "#comic img", 'value' => "@src" }, |
|
430 |
+ 'title' => { 'css' => "#comic img", 'value' => "@alt" }, |
|
431 |
+ 'hovertext' => { 'css' => "#comic img", 'value' => "@title" } |
|
395 | 432 |
}, |
396 | 433 |
'basic_auth' => "user:pass" |
397 | 434 |
} |
@@ -421,7 +458,7 @@ describe Agents::WebsiteAgent do |
||
421 | 458 |
'mode' => 'on_change', |
422 | 459 |
'headers' => { 'foo' => 'bar' }, |
423 | 460 |
'extract' => { |
424 |
- 'url' => { 'css' => "#comic img", 'attr' => "src" }, |
|
461 |
+ 'url' => { 'css' => "#comic img", 'value' => "@src" }, |
|
425 | 462 |
} |
426 | 463 |
} |
427 | 464 |
@checker = Agents::WebsiteAgent.new(:name => "ua", :options => @valid_options) |