@@ -20,7 +20,7 @@ module Agents  | 
            ||
| 20 | 20 | 
                 | 
            
| 21 | 21 | 
                To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes.  | 
            
| 22 | 22 | 
                 | 
            
| 23 | 
                - When parsing HTML or XML, these sub-hashes specify how to extract with a `css` CSS selector and either `'text': true` or `attr` pointing to an attribute name to grab. An example:  | 
            |
| 23 | 
                + When parsing HTML or XML, these sub-hashes specify how to extract with either a `css` CSS selector or an `xpath` XPath expression and either `'text': true` or `attr` pointing to an attribute name to grab. An example:  | 
            |
| 24 | 24 | 
                 | 
            
| 25 | 25 | 
                           'extract': {
               | 
            
| 26 | 26 | 
                             'url': { 'css': "#comic img", 'attr': "src" },
               | 
            
                @@ -109,21 +109,36 @@ module Agents  | 
            ||
| 109 | 109 | 
                else  | 
            
| 110 | 110 | 
                           output = {}
               | 
            
| 111 | 111 | 
                options['extract'].each do |name, extraction_details|  | 
            
| 112 | 
                - result = if extraction_type == "json"  | 
            |
| 113 | 
                - output[name] = Utils.values_at(doc, extraction_details['path'])  | 
            |
| 114 | 
                - else  | 
            |
| 115 | 
                -                       output[name] = doc.css(extraction_details['css']).map { |node|
               | 
            |
| 116 | 
                - if extraction_details['attr']  | 
            |
| 117 | 
                - node.attr(extraction_details['attr'])  | 
            |
| 118 | 
                - elsif extraction_details['text']  | 
            |
| 119 | 
                - node.text()  | 
            |
| 120 | 
                - else  | 
            |
| 121 | 
                - error "'attr' or 'text' is required on HTML or XML extraction patterns"  | 
            |
| 122 | 
                - return  | 
            |
| 123 | 
                - end  | 
            |
| 124 | 
                - }  | 
            |
| 125 | 
                - end  | 
            |
| 126 | 
                -            log "Extracting #{extraction_type} at #{extraction_details['path'] || extraction_details['css']}: #{result}"
               | 
            |
| 112 | 
                + if extraction_type == "json"  | 
            |
| 113 | 
                + result = Utils.values_at(doc, extraction_details['path'])  | 
            |
| 114 | 
                +              log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
               | 
            |
| 115 | 
                + else  | 
            |
| 116 | 
                + case  | 
            |
| 117 | 
                + when css = extraction_details['css']  | 
            |
| 118 | 
                + nodes = doc.css(css)  | 
            |
| 119 | 
                + when xpath = extraction_details['xpath']  | 
            |
| 120 | 
                + nodes = doc.xpath(xpath)  | 
            |
| 121 | 
                + else  | 
            |
| 122 | 
                + error "'css' or 'xpath' is required for HTML or XML extraction"  | 
            |
| 123 | 
                + return  | 
            |
| 124 | 
                + end  | 
            |
| 125 | 
                + unless Nokogiri::XML::NodeSet === nodes  | 
            |
| 126 | 
                + error "The result of HTML/XML extraction was not a NodeSet"  | 
            |
| 127 | 
                + return  | 
            |
| 128 | 
                + end  | 
            |
| 129 | 
                +              result = nodes.map { |node|
               | 
            |
| 130 | 
                + if extraction_details['attr']  | 
            |
| 131 | 
                + node.attr(extraction_details['attr'])  | 
            |
| 132 | 
                + elsif extraction_details['text']  | 
            |
| 133 | 
                + node.text()  | 
            |
| 134 | 
                + else  | 
            |
| 135 | 
                + error "'attr' or 'text' is required on HTML or XML extraction patterns"  | 
            |
| 136 | 
                + return  | 
            |
| 137 | 
                + end  | 
            |
| 138 | 
                + }  | 
            |
| 139 | 
                +              log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
               | 
            |
| 140 | 
                + end  | 
            |
| 141 | 
                + output[name] = result  | 
            |
| 127 | 142 | 
                end  | 
            
| 128 | 143 | 
                 | 
            
| 129 | 144 | 
                           num_unique_lengths = options['extract'].keys.map { |name| output[name].length }.uniq
               | 
            
                @@ -228,4 +243,4 @@ module Agents  | 
            ||
| 228 | 243 | 
                end  | 
            
| 229 | 244 | 
                end  | 
            
| 230 | 245 | 
                end  | 
            
| 231 | 
                -end  | 
            |
| 246 | 
                +end  | 
            
                @@ -114,6 +114,19 @@ describe Agents::WebsiteAgent do  | 
            ||
| 114 | 114 | 
                event.payload['hovertext'].should =~ /^Biologists play reverse/  | 
            
| 115 | 115 | 
                end  | 
            
| 116 | 116 | 
                 | 
            
| 117 | 
                it "parses XPath" do
                  # Rewrite each extraction rule in place: drop its 'css' selector and
                  # set an 'xpath' expression targeting <img> elements under the node
                  # with id "comic". All other keys of each rule are left as they are.
                  @site['extract'].each_value do |details|
                    details.delete('css')
                    details['xpath'] = "//*[@id='comic']//img"
                  end

                  @checker.options = @site
                  @checker.check

                  # Inspect the most recent Event after the check has run.
                  latest = Event.last
                  latest.payload['url'].should == "http://imgs.xkcd.com/comics/evolving.png"
                  latest.payload['title'].should == "Evolving"
                  latest.payload['hovertext'].should =~ /^Biologists play reverse/
                end
            |
| 129 | 
                +  | 
            |
| 117 | 130 | 
                it "should turn relative urls to absolute" do  | 
            
| 118 | 131 | 
                         rel_site = {
               | 
            
| 119 | 132 | 
                'name' => "XKCD",  | 
            
                @@ -258,4 +271,4 @@ describe Agents::WebsiteAgent do  | 
            ||
| 258 | 271 | 
                end  | 
            
| 259 | 272 | 
                end  | 
            
| 260 | 273 | 
                end  | 
            
| 261 | 
                -end  | 
            |
| 274 | 
                +end  |