Merge branch 'website_deduplication_improvement' of https://github.com/Alex-Ikanow/huginn into Alex-Ikanow-website_deduplication_improvement

Andrew Cantino 11 年 前
コミット
668240914f
共有2 個のファイルを変更した102 個の追加12 個の削除を含む
  1. 58 8
      app/models/agents/website_agent.rb
  2. 44 4
      spec/models/agents/website_agent_spec.rb

+ 58 - 8
app/models/agents/website_agent.rb

@@ -34,7 +34,9 @@ module Agents
34 34
 
35 35
       Can be configured to use HTTP basic auth by including the `basic_auth` parameter with `username:password`.
36 36
 
37
-      Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent.
37
+      Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent (only used to set the "working" status).
38
+
39
+      Set `uniqueness_look_back` (defaults to the larger of 200, 3x the number of received events) to limit the number of events checked for uniqueness (typically for performance).
38 40
     MD
39 41
 
40 42
     event_description do
@@ -43,7 +45,8 @@ module Agents
43 45
 
44 46
     default_schedule "every_12h"
45 47
 
46
-    UNIQUENESS_LOOK_BACK = 30
48
+    UNIQUENESS_LOOK_BACK = 200
49
+    UNIQUENESS_FACTOR = 3
47 50
 
48 51
     def working?
49 52
       event_created_within?(options['expected_update_period_in_days']) && !recent_error_logs?
@@ -63,10 +66,32 @@ module Agents
63 66
     end
64 67
 
65 68
     def validate_options
69
+      # Check required fields are present
66 70
       errors.add(:base, "url and expected_update_period_in_days are required") unless options['expected_update_period_in_days'].present? && options['url'].present?
67 71
       if !options['extract'].present? && extraction_type != "json"
68 72
         errors.add(:base, "extract is required for all types except json")
69 73
       end
74
+      # Check options:
75
+      if options['mode'].present?
76
+        if options['mode'] != "on_change" && options['mode'] != "all"
77
+          errors.add(:base, "mode should be all or on_change")
78
+        end
79
+      end
80
+      # Check integer variables:      
81
+      if options['expected_update_period_in_days'].present?
82
+        begin
83
+          Integer(options['expected_update_period_in_days'])
84
+          rescue
85
+          errors.add(:base, "Invalid expected_update_period_in_days format")
86
+        end
87
+      end
88
+      if options['uniqueness_look_back'].present?
89
+        begin
90
+          Integer(options['uniqueness_look_back'])
91
+          rescue
92
+          errors.add(:base, "Invalid uniqueness_look_back format")
93
+        end      
94
+      end
70 95
     end
71 96
 
72 97
     def check
@@ -84,8 +109,9 @@ module Agents
84 109
         doc = parse(response.body)
85 110
 
86 111
         if extract_full_json?
112
+          old_events = previous_payloads 1
87 113
           result = doc
88
-          if store_payload? result
114
+          if store_payload? old_events, result
89 115
             log "Storing new result for '#{name}': #{result.inspect}"
90 116
             create_event :payload => result
91 117
           end
@@ -116,6 +142,7 @@ module Agents
116 142
             return
117 143
           end
118 144
       
145
+          old_events = previous_payloads num_unique_lengths.first
119 146
           num_unique_lengths.first.times do |index|
120 147
             result = {}
121 148
             options['extract'].keys.each do |name|
@@ -125,7 +152,7 @@ module Agents
125 152
               end
126 153
             end
127 154
 
128
-            if store_payload? result
155
+            if store_payload? old_events, result
129 156
               log "Storing new parsed result for '#{name}': #{result.inspect}"
130 157
               create_event :payload => result
131 158
             end
@@ -138,12 +165,35 @@ module Agents
138 165
 
139 166
     private
140 167
 
141
-    def store_payload? result
142
-      !options['mode'] || options['mode'].to_s == "all" || (options['mode'].to_s == "on_change" && !previous_payloads.include?(result.to_json))
168
+    def store_payload?(old_events, result)
169
+      if !options['mode']
170
+        return true
171
+      elsif options['mode'].to_s == "all"
172
+        return true
173
+      elsif options['mode'].to_s == "on_change"
174
+        old_events.each do |old_event|
175
+          if old_event.payload.to_json == result.to_json
176
+            old_event.expires_at = new_event_expiration_date
177
+            old_event.save
178
+            return false
179
+         end
180
+        end
181
+        return true
182
+      end
183
+      raise "Illegal options[mode]: " + options['mode'].to_s
143 184
     end
144 185
 
145
-    def previous_payloads
146
-      events.order("id desc").limit(UNIQUENESS_LOOK_BACK).pluck(:payload).map(&:to_json) if options['mode'].to_s == "on_change"
186
+    def previous_payloads(num_events)
187
+      if options['uniqueness_look_back'].present?
188
+        look_back = options['uniqueness_look_back'].to_i
189
+      else
190
+        # Larger of UNIQUENESS_FACTOR*num_events and UNIQUENESS_LOOK_BACK
191
+        look_back = UNIQUENESS_FACTOR*num_events
192
+        if look_back < UNIQUENESS_LOOK_BACK
193
+          look_back = UNIQUENESS_LOOK_BACK
194
+        end
195
+      end
196
+      events.order("id desc").limit(look_back) if options['mode'].to_s == "on_change"
147 197
     end
148 198
 
149 199
     def extract_full_json?

+ 44 - 4
spec/models/agents/website_agent_spec.rb

@@ -15,15 +15,31 @@ describe Agents::WebsiteAgent do
15 15
           'title' => {'css' => "#comic img", 'attr' => "title"}
16 16
         }
17 17
       }
18
-      @checker = Agents::WebsiteAgent.new(:name => "xkcd", :options => @site)
18
+      @checker = Agents::WebsiteAgent.new(:name => "xkcd", :options => @site, :keep_events_for => 2)
19 19
       @checker.user = users(:bob)
20 20
       @checker.save!
21 21
     end
22 22
 
23 23
     describe "#check" do
24
-      it "should check for changes" do
24
+    
25
+      it "should validate the integer fields" do
26
+        @checker.options['expected_update_period_in_days'] = "nonsense"
27
+        lambda { @checker.save! }.should raise_error;
28
+        @checker.options['expected_update_period_in_days'] = "2"
29
+        @checker.options['uniqueness_look_back'] = "nonsense"
30
+        lambda { @checker.save! }.should raise_error;
31
+        @checker.options['mode'] = "nonsense"
32
+        lambda { @checker.save! }.should raise_error;
33
+        @checker.options = @site
34
+      end
35
+    
36
+      it "should check for changes (and update Event.expires_at)" do
25 37
         lambda { @checker.check }.should change { Event.count }.by(1)
38
+        event = Event.last
39
+        sleep 2
26 40
         lambda { @checker.check }.should_not change { Event.count }
41
+        update_event = Event.last
42
+        update_event.expires_at.should_not == event.expires_at
27 43
       end
28 44
 
29 45
       it "should always save events when in :all mode" do
@@ -35,6 +51,30 @@ describe Agents::WebsiteAgent do
35 51
         }.should change { Event.count }.by(2)
36 52
       end
37 53
 
54
+      it "should take uniqueness_look_back into account during deduplication" do
55
+        @site['mode'] = 'all'
56
+        @checker.options = @site
57
+        @checker.check
58
+        @checker.check
59
+        event = Event.last
60
+        event.payload = "{}"
61
+        event.save
62
+
63
+        lambda {
64
+          @site['mode'] = 'on_change'
65
+          @site['uniqueness_look_back'] = 2
66
+          @checker.options = @site
67
+          @checker.check
68
+        }.should_not change { Event.count }
69
+
70
+        lambda {
71
+          @site['mode'] = 'on_change'
72
+          @site['uniqueness_look_back'] = 1
73
+          @checker.options = @site
74
+          @checker.check
75
+        }.should change { Event.count }.by(1)
76
+      end
77
+
38 78
       it "should log an error if the number of results for a set of extraction patterns differs" do
39 79
         @site['extract']['url']['css'] = "div"
40 80
         @checker.options = @site
@@ -79,7 +119,7 @@ describe Agents::WebsiteAgent do
79 119
           'expected_update_period_in_days' => 2,
80 120
           'type' => "html",
81 121
           'url' => "http://xkcd.com",
82
-          'mode' => :on_change,
122
+          'mode' => "on_change",
83 123
           'extract' => {
84 124
             'url' => {'css' => "#topLeft a", 'attr' => "href"},
85 125
             'title' => {'css' => "#topLeft a", 'text' => "true"}
@@ -217,4 +257,4 @@ describe Agents::WebsiteAgent do
217 257
     end
218 258
   end
219 259
 
220
-end
260
+end