@@ -23,14 +23,16 @@ module Agents |
||
| 23 | 23 |
|
| 24 | 24 |
To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes. |
| 25 | 25 |
|
| 26 |
- When parsing HTML or XML, these sub-hashes specify how to extract with either a `css` CSS selector or a `xpath` XPath expression and either `"text": true` or `attr` pointing to an attribute name to grab. An example: |
|
| 26 |
+ When parsing HTML or XML, these sub-hashes specify how each extraction should be done. The Agent first selects a node set from the document for each extraction key by evaluating either a CSS selector in `css` or an XPath expression in `xpath`. It then evaluates the XPath expression in `value` on each node in the node set, converting the result into a string. Here's an example: |
|
| 27 | 27 |
|
| 28 | 28 |
"extract": {
|
| 29 |
- "url": { "css": "#comic img", "attr": "src" },
|
|
| 30 |
- "title": { "css": "#comic img", "attr": "title" },
|
|
| 31 |
- "body_text": { "css": "div.main", "text": true }
|
|
| 29 |
+ "url": { "css": "#comic img", "value": "@src" },
|
|
| 30 |
+ "title": { "css": "#comic img", "value": "@title" },
|
|
| 31 |
+ "body_text": { "css": "div.main", "value": "text()" }
|
|
| 32 | 32 |
} |
| 33 | 33 |
|
| 34 |
+ "@_attr_" is the XPath expression that extracts the value of an attribute named _attr_ from a node, and "text()" extracts the node's own enclosed text (use "string(.)" to include the text of nested elements as well). You can also use [XPath functions](http://www.w3.org/TR/xpath/#section-String-Functions) such as `normalize-space` to strip and squeeze whitespace, `substring-after` to extract part of a text, and `translate` to remove commas from a formatted number. |
|
| 35 |
+ |
|
| 34 | 36 |
When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about. For example: |
| 35 | 37 |
|
| 36 | 38 |
"extract": {
|
@@ -70,9 +72,9 @@ module Agents |
||
| 70 | 72 |
'type' => "html", |
| 71 | 73 |
'mode' => "on_change", |
| 72 | 74 |
'extract' => {
|
| 73 |
- 'url' => { 'css' => "#comic img", 'attr' => "src" },
|
|
| 74 |
- 'title' => { 'css' => "#comic img", 'attr' => "alt" },
|
|
| 75 |
- 'hovertext' => { 'css' => "#comic img", 'attr' => "title" }
|
|
| 75 |
+ 'url' => { 'css' => "#comic img", 'value' => "@src" },
|
|
| 76 |
+ 'title' => { 'css' => "#comic img", 'value' => "@alt" },
|
|
| 77 |
+ 'hovertext' => { 'css' => "#comic img", 'value' => "@title" }
|
|
| 76 | 78 |
} |
| 77 | 79 |
} |
| 78 | 80 |
end |
@@ -157,14 +159,11 @@ module Agents |
||
| 157 | 159 |
return |
| 158 | 160 |
end |
| 159 | 161 |
result = nodes.map { |node|
|
| 160 |
- if extraction_details['attr'] |
|
| 161 |
- node.attr(extraction_details['attr']) |
|
| 162 |
- elsif extraction_details['text'] |
|
| 163 |
- node.text() |
|
| 164 |
- else |
|
| 165 |
- error '"attr" or "text" is required on HTML or XML extraction patterns' |
|
| 166 |
- return |
|
| 162 |
+ value, = node.xpath(extraction_details['value']) |
|
| 163 |
+ if value.is_a?(Float) && value.to_i == value |
|
| 164 |
+ value = value.to_i |
|
| 167 | 165 |
end |
| 166 |
+ value.to_s |
|
| 168 | 167 |
} |
| 169 | 168 |
log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
|
| 170 | 169 |
end |
@@ -0,0 +1,22 @@ |
||
| 1 |
# Data migration for Agents::WebsiteAgent.
#
# Rewrites the legacy HTML/XML extraction options (`"attr": "<name>"` and
# `"text": true`) into the XPath-based `"value"` option. Agents whose
# extraction type is JSON are left untouched.
class AdoptXpathInWebsiteAgent < ActiveRecord::Migration
  # XPath equivalent of the legacy `"text": true` option. The legacy reader
  # called `node.text()`, which yields all text enclosed by the node, nested
  # elements included. A bare `text()` would select only the node's direct
  # text children, and the new reader keeps just the first result, so
  # `string(.)` is the expression that preserves the old output.
  LEGACY_TEXT_XPATH = 'string(.)'.freeze

  # Converts every non-JSON WebsiteAgent's extraction patterns in place.
  #
  # Raises whatever `agent.save!` raises if a converted agent fails to save.
  def up
    Agent.where(type: 'Agents::WebsiteAgent').each do |agent|
      next if agent.extraction_type == 'json'

      # An agent with a missing or malformed `extract` option has nothing to
      # convert; skip it rather than abort the whole migration.
      extract = agent.options['extract']
      next unless extract.is_a?(Hash)

      # Mark the serialized options dirty before mutating them in place.
      agent.options_will_change!
      extract.each_value do |extraction|
        next unless extraction.is_a?(Hash)

        # Remove both legacy keys so that no stale option is left behind.
        attr = extraction.delete('attr')
        text = extraction.delete('text')

        # The legacy reader checked `attr` before `text`; keep that
        # precedence for patterns that carried both keys.
        if attr
          extraction['value'] = "@#{attr}"
        elsif text
          extraction['value'] = LEGACY_TEXT_XPATH
        end
      end
      agent.save!
    end
  end

  # The original `attr`/`text` keys are discarded by `up`, so the change
  # cannot be undone.
  def down
    raise ActiveRecord::IrreversibleMigration, "Cannot revert this migration"
  end
end
@@ -10,8 +10,8 @@ jane_website_agent: |
||
| 10 | 10 |
:expected_update_period_in_days => 2, |
| 11 | 11 |
:mode => :on_change, |
| 12 | 12 |
:extract => {
|
| 13 |
- :title => {:css => "item title", :text => true},
|
|
| 14 |
- :url => {:css => "item link", :text => true}
|
|
| 13 |
+ :title => {:css => "item title", :value => 'text()'},
|
|
| 14 |
+ :url => {:css => "item link", :value => 'text()'}
|
|
| 15 | 15 |
} |
| 16 | 16 |
}.to_json.inspect %> |
| 17 | 17 |
|
@@ -27,8 +27,8 @@ bob_website_agent: |
||
| 27 | 27 |
:expected_update_period_in_days => 2, |
| 28 | 28 |
:mode => :on_change, |
| 29 | 29 |
:extract => {
|
| 30 |
- :url => {:css => "#comic img", :attr => "src"},
|
|
| 31 |
- :title => {:css => "#comic img", :attr => "title"}
|
|
| 30 |
+ :url => {:css => "#comic img", :value => "@src"},
|
|
| 31 |
+ :title => {:css => "#comic img", :value => "@title"}
|
|
| 32 | 32 |
} |
| 33 | 33 |
}.to_json.inspect %> |
| 34 | 34 |
|
@@ -768,8 +768,8 @@ describe AgentDrop do |
||
| 768 | 768 |
url: 'http://dilbert.com/', |
| 769 | 769 |
mode: 'on_change', |
| 770 | 770 |
extract: {
|
| 771 |
- url: { css: '[id^=strip_enlarged_] img', attr: 'src' },
|
|
| 772 |
- title: { css: '.STR_DateStrip', text: true },
|
|
| 771 |
+ url: { css: '[id^=strip_enlarged_] img', value: '@src' },
|
|
| 772 |
+ title: { css: '.STR_DateStrip', value: 'text()' },
|
|
| 773 | 773 |
}, |
| 774 | 774 |
}, |
| 775 | 775 |
schedule: 'every_12h', |
@@ -11,9 +11,9 @@ describe Agents::WebsiteAgent do |
||
| 11 | 11 |
'url' => "http://xkcd.com", |
| 12 | 12 |
'mode' => 'on_change', |
| 13 | 13 |
'extract' => {
|
| 14 |
- 'url' => { 'css' => "#comic img", 'attr' => "src" },
|
|
| 15 |
- 'title' => { 'css' => "#comic img", 'attr' => "alt" },
|
|
| 16 |
- 'hovertext' => { 'css' => "#comic img", 'attr' => "title" }
|
|
| 14 |
+ 'url' => { 'css' => "#comic img", 'value' => "@src" },
|
|
| 15 |
+ 'title' => { 'css' => "#comic img", 'value' => "@alt" },
|
|
| 16 |
+ 'hovertext' => { 'css' => "#comic img", 'value' => "@title" }
|
|
| 17 | 17 |
} |
| 18 | 18 |
} |
| 19 | 19 |
@checker = Agents::WebsiteAgent.new(:name => "xkcd", :options => @valid_options, :keep_events_for => 2) |
@@ -256,8 +256,8 @@ describe Agents::WebsiteAgent do |
||
| 256 | 256 |
'url' => "http://xkcd.com", |
| 257 | 257 |
'mode' => "on_change", |
| 258 | 258 |
'extract' => {
|
| 259 |
- 'url' => {'css' => "#topLeft a", 'attr' => "href"},
|
|
| 260 |
- 'title' => {'css' => "#topLeft a", 'text' => "true"}
|
|
| 259 |
+ 'url' => {'css' => "#topLeft a", 'value' => "@href"},
|
|
| 260 |
+ 'title' => {'css' => "#topLeft a", 'value' => "text()"}
|
|
| 261 | 261 |
} |
| 262 | 262 |
} |
| 263 | 263 |
rel = Agents::WebsiteAgent.new(:name => "xkcd", :options => rel_site) |
@@ -389,9 +389,9 @@ describe Agents::WebsiteAgent do |
||
| 389 | 389 |
'url' => "http://www.example.com", |
| 390 | 390 |
'mode' => 'on_change', |
| 391 | 391 |
'extract' => {
|
| 392 |
- 'url' => { 'css' => "#comic img", 'attr' => "src" },
|
|
| 393 |
- 'title' => { 'css' => "#comic img", 'attr' => "alt" },
|
|
| 394 |
- 'hovertext' => { 'css' => "#comic img", 'attr' => "title" }
|
|
| 392 |
+ 'url' => { 'css' => "#comic img", 'value' => "@src" },
|
|
| 393 |
+ 'title' => { 'css' => "#comic img", 'value' => "@alt" },
|
|
| 394 |
+ 'hovertext' => { 'css' => "#comic img", 'value' => "@title" }
|
|
| 395 | 395 |
}, |
| 396 | 396 |
'basic_auth' => "user:pass" |
| 397 | 397 |
} |
@@ -421,7 +421,7 @@ describe Agents::WebsiteAgent do |
||
| 421 | 421 |
'mode' => 'on_change', |
| 422 | 422 |
'headers' => { 'foo' => 'bar' },
|
| 423 | 423 |
'extract' => {
|
| 424 |
- 'url' => { 'css' => "#comic img", 'attr' => "src" },
|
|
| 424 |
+ 'url' => { 'css' => "#comic img", 'value' => "@src" },
|
|
| 425 | 425 |
} |
| 426 | 426 |
} |
| 427 | 427 |
@checker = Agents::WebsiteAgent.new(:name => "ua", :options => @valid_options) |