Merge pull request #164 from cantino/Alex-Ikanow-website_deduplication_improvement

Alex ikanow website deduplication improvement

Andrew Cantino 10 年 前
コミット
5b54cded24
共有2 個のファイルを変更した116 個の追加25 個の削除を含む
  1. 72 20
      app/models/agents/website_agent.rb
  2. 44 5
      spec/models/agents/website_agent_spec.rb

+ 72 - 20
app/models/agents/website_agent.rb

@@ -6,6 +6,11 @@ module Agents
6 6
   class WebsiteAgent < Agent
7 7
     cannot_receive_events!
8 8
 
9
+    default_schedule "every_12h"
10
+
11
+    UNIQUENESS_LOOK_BACK = 200
12
+    UNIQUENESS_FACTOR = 3
13
+
9 14
     description <<-MD
10 15
       The WebsiteAgent scrapes a website, XML document, or JSON feed and creates Events based on the results.
11 16
 
@@ -34,17 +39,15 @@ module Agents
34 39
 
35 40
       Can be configured to use HTTP basic auth by including the `basic_auth` parameter with `username:password`.
36 41
 
37
-      Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent.
42
+      Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent.  This is only used to set the "working" status.
43
+
44
+      Set `uniqueness_look_back` to limit the number of events checked for uniqueness (typically for performance).  This defaults to the larger of #{UNIQUENESS_LOOK_BACK} or #{UNIQUENESS_FACTOR}x the number of detected received results.
38 45
     MD
39 46
 
40 47
     event_description do
41 48
       "Events will have the fields you specified.  Your options look like:\n\n    #{Utils.pretty_print options['extract']}"
42 49
     end
43 50
 
44
-    default_schedule "every_12h"
45
-
46
-    UNIQUENESS_LOOK_BACK = 30
47
-
48 51
     def working?
49 52
       event_created_within?(options['expected_update_period_in_days']) && !recent_error_logs?
50 53
     end
@@ -54,7 +57,7 @@ module Agents
54 57
           'expected_update_period_in_days' => "2",
55 58
           'url' => "http://xkcd.com",
56 59
           'type' => "html",
57
-          'mode' => :on_change,
60
+          'mode' => "on_change",
58 61
           'extract' => {
59 62
             'url' => {'css' => "#comic img", 'attr' => "src"},
60 63
             'title' => {'css' => "#comic img", 'attr' => "title"}
@@ -63,31 +66,44 @@ module Agents
63 66
     end
64 67
 
65 68
     def validate_options
69
+      # Check for required fields
66 70
       errors.add(:base, "url and expected_update_period_in_days are required") unless options['expected_update_period_in_days'].present? && options['url'].present?
67 71
       if !options['extract'].present? && extraction_type != "json"
68 72
         errors.add(:base, "extract is required for all types except json")
69 73
       end
74
+
75
+      # Check for optional fields
76
+      if options['mode'].present?
77
+        errors.add(:base, "mode must be set to on_change or all") unless %w[on_change all].include?(options['mode'])
78
+      end
79
+
80
+      if options['expected_update_period_in_days'].present?
81
+        errors.add(:base, "Invalid expected_update_period_in_days format") unless is_positive_integer?(options['expected_update_period_in_days'])
82
+      end
83
+
84
+      if options['uniqueness_look_back'].present?
85
+        errors.add(:base, "Invalid uniqueness_look_back format") unless is_positive_integer?(options['uniqueness_look_back'])
86
+      end
70 87
     end
71 88
 
72 89
     def check
73 90
       hydra = Typhoeus::Hydra.new
74 91
       log "Fetching #{options['url']}"
75
-      request_opts = {:followlocation => true}
76
-      if options['basic_auth'].present?
77
-        request_opts[:userpwd] = options['basic_auth']
78
-      end
92
+      request_opts = { :followlocation => true }
93
+      request_opts[:userpwd] = options['basic_auth'] if options['basic_auth'].present?
79 94
       request = Typhoeus::Request.new(options['url'], request_opts)
95
+
80 96
       request.on_failure do |response|
81 97
         error "Failed: #{response.inspect}"
82 98
       end
99
+
83 100
       request.on_success do |response|
84 101
         doc = parse(response.body)
85 102
 
86 103
         if extract_full_json?
87
-          result = doc
88
-          if store_payload? result
89
-            log "Storing new result for '#{name}': #{result.inspect}"
90
-            create_event :payload => result
104
+          if store_payload!(previous_payloads(1), doc)
105
+            log "Storing new result for '#{name}': #{doc.inspect}"
106
+            create_event :payload => doc
91 107
           end
92 108
         else
93 109
           output = {}
@@ -116,6 +132,7 @@ module Agents
116 132
             return
117 133
           end
118 134
       
135
+          old_events = previous_payloads num_unique_lengths.first
119 136
           num_unique_lengths.first.times do |index|
120 137
             result = {}
121 138
             options['extract'].keys.each do |name|
@@ -125,7 +142,7 @@ module Agents
125 142
               end
126 143
             end
127 144
 
128
-            if store_payload? result
145
+            if store_payload!(old_events, result)
129 146
               log "Storing new parsed result for '#{name}': #{result.inspect}"
130 147
               create_event :payload => result
131 148
             end
@@ -138,16 +155,43 @@ module Agents
138 155
 
139 156
     private
140 157
 
141
-    def store_payload? result
142
-      !options['mode'] || options['mode'].to_s == "all" || (options['mode'].to_s == "on_change" && !previous_payloads.include?(result.to_json))
158
+    # This method returns true if the result should be stored as a new event.
159
+    # If mode is set to 'on_change', this method may return false and update an existing
160
+    # event to expire further in the future.
161
+    def store_payload!(old_events, result)
162
+      if !options['mode'].present?
163
+        return true
164
+      elsif options['mode'].to_s == "all"
165
+        return true
166
+      elsif options['mode'].to_s == "on_change"
167
+        result_json = result.to_json
168
+        old_events.each do |old_event|
169
+          if old_event.payload.to_json == result_json
170
+            old_event.expires_at = new_event_expiration_date
171
+            old_event.save!
172
+            return false
173
+         end
174
+        end
175
+        return true
176
+      end
177
+      raise "Illegal options[mode]: " + options['mode'].to_s
143 178
     end
144 179
 
145
-    def previous_payloads
146
-      events.order("id desc").limit(UNIQUENESS_LOOK_BACK).pluck(:payload).map(&:to_json) if options['mode'].to_s == "on_change"
180
+    def previous_payloads(num_events)
181
+      if options['uniqueness_look_back'].present?
182
+        look_back = options['uniqueness_look_back'].to_i
183
+      else
184
+        # Larger of UNIQUENESS_FACTOR * num_events and UNIQUENESS_LOOK_BACK
185
+        look_back = UNIQUENESS_FACTOR * num_events
186
+        if look_back < UNIQUENESS_LOOK_BACK
187
+          look_back = UNIQUENESS_LOOK_BACK
188
+        end
189
+      end
190
+      events.order("id desc").limit(look_back) if options['mode'].present? && options['mode'].to_s == "on_change"
147 191
     end
148 192
 
149 193
     def extract_full_json?
150
-      (!options['extract'].present? && extraction_type == "json")
194
+      !options['extract'].present? && extraction_type == "json"
151 195
     end
152 196
 
153 197
     def extraction_type
@@ -174,5 +218,13 @@ module Agents
174 218
           raise "Unknown extraction type #{extraction_type}"
175 219
       end
176 220
     end
221
+
222
+    def is_positive_integer?(value)
223
+      begin
224
+        Integer(value) >= 0
225
+      rescue
226
+        false
227
+      end
228
+    end
177 229
   end
178 230
 end

+ 44 - 5
spec/models/agents/website_agent_spec.rb

@@ -15,15 +15,31 @@ describe Agents::WebsiteAgent do
15 15
           'title' => {'css' => "#comic img", 'attr' => "title"}
16 16
         }
17 17
       }
18
-      @checker = Agents::WebsiteAgent.new(:name => "xkcd", :options => @site)
18
+      @checker = Agents::WebsiteAgent.new(:name => "xkcd", :options => @site, :keep_events_for => 2)
19 19
       @checker.user = users(:bob)
20 20
       @checker.save!
21 21
     end
22 22
 
23 23
     describe "#check" do
24
-      it "should check for changes" do
24
+    
25
+      it "should validate the integer fields" do
26
+        @checker.options['expected_update_period_in_days'] = "nonsense"
27
+        lambda { @checker.save! }.should raise_error;
28
+        @checker.options['expected_update_period_in_days'] = "2"
29
+        @checker.options['uniqueness_look_back'] = "nonsense"
30
+        lambda { @checker.save! }.should raise_error;
31
+        @checker.options['mode'] = "nonsense"
32
+        lambda { @checker.save! }.should raise_error;
33
+        @checker.options = @site
34
+      end
35
+    
36
+      it "should check for changes (and update Event.expires_at)" do
25 37
         lambda { @checker.check }.should change { Event.count }.by(1)
38
+        event = Event.last
39
+        sleep 2
26 40
         lambda { @checker.check }.should_not change { Event.count }
41
+        update_event = Event.last
42
+        update_event.expires_at.should_not == event.expires_at
27 43
       end
28 44
 
29 45
       it "should always save events when in :all mode" do
@@ -35,6 +51,30 @@ describe Agents::WebsiteAgent do
35 51
         }.should change { Event.count }.by(2)
36 52
       end
37 53
 
54
+      it "should take uniqueness_look_back into account during deduplication" do
55
+        @site['mode'] = 'all'
56
+        @checker.options = @site
57
+        @checker.check
58
+        @checker.check
59
+        event = Event.last
60
+        event.payload = "{}"
61
+        event.save
62
+
63
+        lambda {
64
+          @site['mode'] = 'on_change'
65
+          @site['uniqueness_look_back'] = 2
66
+          @checker.options = @site
67
+          @checker.check
68
+        }.should_not change { Event.count }
69
+
70
+        lambda {
71
+          @site['mode'] = 'on_change'
72
+          @site['uniqueness_look_back'] = 1
73
+          @checker.options = @site
74
+          @checker.check
75
+        }.should change { Event.count }.by(1)
76
+      end
77
+
38 78
       it "should log an error if the number of results for a set of extraction patterns differs" do
39 79
         @site['extract']['url']['css'] = "div"
40 80
         @checker.options = @site
@@ -79,7 +119,7 @@ describe Agents::WebsiteAgent do
79 119
           'expected_update_period_in_days' => 2,
80 120
           'type' => "html",
81 121
           'url' => "http://xkcd.com",
82
-          'mode' => :on_change,
122
+          'mode' => "on_change",
83 123
           'extract' => {
84 124
             'url' => {'css' => "#topLeft a", 'attr' => "href"},
85 125
             'title' => {'css' => "#topLeft a", 'text' => "true"}
@@ -216,5 +256,4 @@ describe Agents::WebsiteAgent do
216 256
       end
217 257
     end
218 258
   end
219
-
220
-end
259
+end