Add a parser type `text` to WebsiteAgent.

Akinori MUSHA 10 anos atrás
pai
commit
fca8051e81
2 arquivos alterados com 89 adições e 4 exclusões
  1. 37 4
      app/models/agents/website_agent.rb
  2. 52 0
      spec/models/agents/website_agent_spec.rb

+ 37 - 4
app/models/agents/website_agent.rb

@@ -19,7 +19,7 @@ module Agents
19 19
 
20 20
       `url` can be a single url, or an array of urls (for example, for multiple pages with the exact same structure but different content to scrape)
21 21
 
22
-      The `type` value can be `xml`, `html`, or `json`.
22
+      The `type` value can be `xml`, `html`, `json`, or `text`.
23 23
 
24 24
       To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes.
25 25
 
@@ -40,6 +40,26 @@ module Agents
40 40
             "description": { "path": "results.data[*].description" }
41 41
           }
42 42
 
43
+      When parsing text, each sub-hash should contain a `regexp` and an `index`.  The content is matched against the regular expression repeatedly from the beginning through to the end, and the capture group specified by `index` is collected from each match.  Each `index` should be either an integer (a group number, where 0 means the whole match) or a string naming a named capture group written as `(?<_name_>...)`.  For example, to parse lines of `_word_: _definition_`, the following should work:
44
+
45
+          "extract": {
46
+            "word": { "regexp": "^(.+?): (.+)$", "index": 1 },
47
+            "definition": { "regexp": "^(.+?): (.+)$", "index": 2 }
48
+          }
49
+
50
+      Or if you prefer names to numbers for index:
51
+
52
+          "extract": {
53
+            "word": { "regexp": "^(?<word>.+?): (?<definition>.+)$", "index": "word" },
54
+            "definition": { "regexp": "^(?<word>.+?): (?<definition>.+)$", "index": "definition" }
55
+          }
56
+
57
+      To extract the whole content as one event:
58
+
59
+          "extract": {
60
+            "content": { "regexp": "\A(?:.|\n)*\z", "index": 0 }
61
+          }
62
+
43 63
       Note that for all of the formats, whatever you extract MUST have the same number of matches for each extractor.  E.g., if you're extracting rows, all extractors must match all rows.  For generating CSS selectors, something like [SelectorGadget](http://selectorgadget.com) may be helpful.
44 64
 
45 65
       Can be configured to use HTTP basic auth by including the `basic_auth` parameter with `"username:password"`, or `["username", "password"]`.
@@ -140,7 +160,15 @@ module Agents
140 160
           else
141 161
             output = {}
142 162
             interpolated['extract'].each do |name, extraction_details|
143
-              if extraction_type == "json"
163
+              case extraction_type
164
+              when "text"
165
+                regexp = Regexp.new(extraction_details['regexp'])
166
+                result = []
167
+                doc.scan(regexp) {
168
+                  result << Regexp.last_match[extraction_details['index']]
169
+                }
170
+                log "Extracting #{extraction_type} at #{regexp}: #{result}"
171
+              when "json"
144 172
                 result = Utils.values_at(doc, extraction_details['path'])
145 173
                 log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
146 174
               else
@@ -253,10 +281,13 @@ module Agents
253 281
 
254 282
     def extraction_type
255 283
       (interpolated['type'] || begin
256
-        if interpolated['url'] =~ /\.(rss|xml)$/i
284
+        case interpolated['url']
285
+        when /\.(rss|xml)$/i
257 286
           "xml"
258
-        elsif interpolated['url'] =~ /\.json$/i
287
+        when /\.json$/i
259 288
           "json"
289
+        when /\.(txt|text)$/i
290
+          "text"
260 291
         else
261 292
           "html"
262 293
         end
@@ -271,6 +302,8 @@ module Agents
271 302
           JSON.parse(data)
272 303
         when "html"
273 304
           Nokogiri::HTML(data)
305
+        when "text"
306
+          data
274 307
         else
275 308
           raise "Unknown extraction type #{extraction_type}"
276 309
       end

+ 52 - 0
spec/models/agents/website_agent_spec.rb

@@ -398,6 +398,58 @@ describe Agents::WebsiteAgent do
398 398
           event.payload['response']['title'].should == "hello!"
399 399
         end
400 400
       end
401
+
402
+      describe "text parsing" do
403
+        before do
404
+          stub_request(:any, /text-site/).to_return(body: <<-EOF, status: 200)
405
+water: wet
406
+fire: hot
407
+          EOF
408
+          site = {
409
+            'name' => 'Some Text Response',
410
+            'expected_update_period_in_days' => '2',
411
+            'type' => 'text',
412
+            'url' => 'http://text-site.com',
413
+            'mode' => 'on_change',
414
+            'extract' => {
415
+              'word' => { 'regexp' => '^(.+?): (.+)$', index: 1 },
416
+              'property' => { 'regexp' => '^(.+?): (.+)$', index: 2 },
417
+            }
418
+          }
419
+          @checker = Agents::WebsiteAgent.new(name: 'Text Site', options: site)
420
+          @checker.user = users(:bob)
421
+          @checker.save!
422
+        end
423
+
424
+        it "works with regexp" do
425
+          @checker.options = @checker.options.merge('extract' => {
426
+            'word' => { 'regexp' => '^(?<word>.+?): (?<property>.+)$', index: 'word' },
427
+            'property' => { 'regexp' => '^(?<word>.+?): (?<property>.+)$', index: 'property' },
428
+          })
429
+
430
+          lambda {
431
+            @checker.check
432
+          }.should change { Event.count }.by(2)
433
+
434
+          event1, event2 = Event.last(2)
435
+          event1.payload['word'].should == 'water'
436
+          event1.payload['property'].should == 'wet'
437
+          event2.payload['word'].should == 'fire'
438
+          event2.payload['property'].should == 'hot'
439
+        end
440
+
441
+        it "works with regexp with named capture" do
442
+          lambda {
443
+            @checker.check
444
+          }.should change { Event.count }.by(2)
445
+
446
+          event1, event2 = Event.last(2)
447
+          event1.payload['word'].should == 'water'
448
+          event1.payload['property'].should == 'wet'
449
+          event2.payload['word'].should == 'fire'
450
+          event2.payload['property'].should == 'hot'
451
+        end
452
+      end
401 453
     end
402 454
 
403 455
     describe "#receive" do