@@ -20,7 +20,7 @@ module Agents |
||
20 | 20 |
|
21 | 21 |
To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes. |
22 | 22 |
|
23 |
- When parsing HTML or XML, these sub-hashes specify how to extract with a `css` CSS selector and either `'text': true` or `attr` pointing to an attribute name to grab. An example: |
|
23 |
+ When parsing HTML or XML, these sub-hashes specify how to extract using either a `css` CSS selector or an `xpath` XPath expression, combined with either `'text': true` or `attr` pointing to an attribute name to grab. An example: |
|
24 | 24 |
|
25 | 25 |
'extract': { |
26 | 26 |
'url': { 'css': "#comic img", 'attr': "src" }, |
@@ -109,21 +109,36 @@ module Agents |
||
109 | 109 |
else |
110 | 110 |
output = {} |
111 | 111 |
options['extract'].each do |name, extraction_details| |
112 |
- result = if extraction_type == "json" |
|
113 |
- output[name] = Utils.values_at(doc, extraction_details['path']) |
|
114 |
- else |
|
115 |
- output[name] = doc.css(extraction_details['css']).map { |node| |
|
116 |
- if extraction_details['attr'] |
|
117 |
- node.attr(extraction_details['attr']) |
|
118 |
- elsif extraction_details['text'] |
|
119 |
- node.text() |
|
120 |
- else |
|
121 |
- error "'attr' or 'text' is required on HTML or XML extraction patterns" |
|
122 |
- return |
|
123 |
- end |
|
124 |
- } |
|
125 |
- end |
|
126 |
- log "Extracting #{extraction_type} at #{extraction_details['path'] || extraction_details['css']}: #{result}" |
|
112 |
+ if extraction_type == "json" |
|
113 |
+ result = Utils.values_at(doc, extraction_details['path']) |
|
114 |
+ log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}" |
|
115 |
+ else |
|
116 |
+ case |
|
117 |
+ when css = extraction_details['css'] |
|
118 |
+ nodes = doc.css(css) |
|
119 |
+ when xpath = extraction_details['xpath'] |
|
120 |
+ nodes = doc.xpath(xpath) |
|
121 |
+ else |
|
122 |
+ error "'css' or 'xpath' is required for HTML or XML extraction" |
|
123 |
+ return |
|
124 |
+ end |
|
125 |
+ unless Nokogiri::XML::NodeSet === nodes |
|
126 |
+ error "The result of HTML/XML extraction was not a NodeSet" |
|
127 |
+ return |
|
128 |
+ end |
|
129 |
+ result = nodes.map { |node| |
|
130 |
+ if extraction_details['attr'] |
|
131 |
+ node.attr(extraction_details['attr']) |
|
132 |
+ elsif extraction_details['text'] |
|
133 |
+ node.text() |
|
134 |
+ else |
|
135 |
+ error "'attr' or 'text' is required on HTML or XML extraction patterns" |
|
136 |
+ return |
|
137 |
+ end |
|
138 |
+ } |
|
139 |
+ log "Extracting #{extraction_type} at #{xpath || css}: #{result}" |
|
140 |
+ end |
|
141 |
+ output[name] = result |
|
127 | 142 |
end |
128 | 143 |
|
129 | 144 |
num_unique_lengths = options['extract'].keys.map { |name| output[name].length }.uniq |
@@ -228,4 +243,4 @@ module Agents |
||
228 | 243 |
end |
229 | 244 |
end |
230 | 245 |
end |
231 |
-end |
|
246 |
+end |
@@ -114,6 +114,19 @@ describe Agents::WebsiteAgent do |
||
114 | 114 |
event.payload['hovertext'].should =~ /^Biologists play reverse/ |
115 | 115 |
end |
116 | 116 |
|
117 |
+ it "parses XPath" do |
|
118 |
+ @site['extract'].each { |key, value| |
|
119 |
+ value.delete('css') |
|
120 |
+ value['xpath'] = "//*[@id='comic']//img" |
|
121 |
+ } |
|
122 |
+ @checker.options = @site |
|
123 |
+ @checker.check |
|
124 |
+ event = Event.last |
|
125 |
+ event.payload['url'].should == "http://imgs.xkcd.com/comics/evolving.png" |
|
126 |
+ event.payload['title'].should == "Evolving" |
|
127 |
+ event.payload['hovertext'].should =~ /^Biologists play reverse/ |
|
128 |
+ end |
|
129 |
+ |
|
117 | 130 |
it "should turn relative urls to absolute" do |
118 | 131 |
rel_site = { |
119 | 132 |
'name' => "XKCD", |
@@ -258,4 +271,4 @@ describe Agents::WebsiteAgent do |
||
258 | 271 |
end |
259 | 272 |
end |
260 | 273 |
end |
261 |
-end |
|
274 |
+end |