@@ -51,12 +51,13 @@ module LongRunnable |
||
51 | 51 |
end |
52 | 52 |
|
53 | 53 |
class Worker |
54 |
- attr_reader :thread, :id, :agent, :config, :mutex, :scheduler |
|
54 |
+ attr_reader :thread, :id, :agent, :config, :mutex, :scheduler, :restarting |
|
55 | 55 |
|
56 | 56 |
def initialize(options = {}) |
57 | 57 |
@id = options[:id] |
58 | 58 |
@agent = options[:agent] |
59 | 59 |
@config = options[:config] |
60 |
+ @restarting = false |
|
60 | 61 |
end |
61 | 62 |
|
62 | 63 |
def run |
@@ -65,6 +66,7 @@ module LongRunnable |
||
65 | 66 |
|
66 | 67 |
def run! |
67 | 68 |
@thread = Thread.new do |
69 |
+ Thread.current[:name] = "#{id}-#{Time.now}" |
|
68 | 70 |
begin |
69 | 71 |
run |
70 | 72 |
rescue SignalException, SystemExit |
@@ -90,18 +92,23 @@ module LongRunnable |
||
90 | 92 |
if respond_to?(:stop) |
91 | 93 |
stop |
92 | 94 |
else |
93 |
- terminate_thread |
|
95 |
+ terminate_thread! |
|
94 | 96 |
end |
95 | 97 |
end |
96 | 98 |
|
97 |
- def terminate_thread |
|
98 |
- thread.terminate if thread |
|
99 |
+ def terminate_thread! |
|
100 |
+ if thread |
|
101 |
+ thread.terminate |
|
102 |
+ thread.wakeup if thread.status == 'sleep' |
|
103 |
+ end |
|
99 | 104 |
end |
100 | 105 |
|
101 | 106 |
def restart! |
102 |
- stop! |
|
103 |
- setup!(scheduler, mutex) |
|
104 |
- run! |
|
107 |
+ without_alive_check do |
|
108 |
+ stop! |
|
109 |
+ setup!(scheduler, mutex) |
|
110 |
+ run! |
|
111 |
+ end |
|
105 | 112 |
end |
106 | 113 |
|
107 | 114 |
def every(*args, &blk) |
@@ -125,5 +132,12 @@ module LongRunnable |
||
125 | 132 |
def schedule(method, args, &blk) |
126 | 133 |
@scheduler.send(method, *args, tag: id, &blk) |
127 | 134 |
end |
135 |
+ |
|
136 |
+ def without_alive_check(&blk) |
|
137 |
+ @restarting = true |
|
138 |
+ yield |
|
139 |
+ ensure |
|
140 |
+ @restarting = false |
|
141 |
+ end |
|
128 | 142 |
end |
129 | 143 |
end |
@@ -1,6 +1,7 @@ |
||
1 | 1 |
module Agents |
2 | 2 |
class EventFormattingAgent < Agent |
3 | 3 |
cannot_be_scheduled! |
4 |
+ can_dry_run! |
|
4 | 5 |
|
5 | 6 |
description <<-MD |
6 | 7 |
The Event Formatting Agent allows you to format incoming Events, adding new fields as needed. |
@@ -159,7 +159,7 @@ module Agents |
||
159 | 159 |
@filter_to_agent_map = @config[:filter_to_agent_map] |
160 | 160 |
|
161 | 161 |
schedule_in RELOAD_TIMEOUT do |
162 |
- puts "--> Restarting TwitterStream #{id}" |
|
162 |
+ puts "--> Restarting TwitterStream #{id} at #{Time.now} <--" |
|
163 | 163 |
restart! |
164 | 164 |
end |
165 | 165 |
end |
@@ -176,7 +176,7 @@ module Agents |
||
176 | 176 |
|
177 | 177 |
def stop |
178 | 178 |
EventMachine.stop_event_loop if EventMachine.reactor_running? |
179 |
- thread.terminate |
|
179 |
+ terminate_thread! |
|
180 | 180 |
end |
181 | 181 |
|
182 | 182 |
private |
@@ -199,16 +199,16 @@ module Agents |
||
199 | 199 |
end |
200 | 200 |
|
201 | 201 |
stream.on_error do |message| |
202 |
- STDERR.puts " --> Twitter error: #{message} <--" |
|
202 |
+ STDERR.puts " --> Twitter error: #{message} at #{Time.now} <--" |
|
203 | 203 |
end |
204 | 204 |
|
205 | 205 |
stream.on_no_data do |message| |
206 |
- STDERR.puts " --> Got no data for awhile; trying to reconnect." |
|
206 |
+ STDERR.puts " --> Got no data for awhile; trying to reconnect at #{Time.now} <--" |
|
207 | 207 |
restart! |
208 | 208 |
end |
209 | 209 |
|
210 | 210 |
stream.on_max_reconnects do |timeout, retries| |
211 |
- STDERR.puts " --> Oops, tried too many times! <--" |
|
211 |
+ STDERR.puts " --> Oops, tried too many times! at #{Time.now} <--" |
|
212 | 212 |
sleep 60 |
213 | 213 |
restart! |
214 | 214 |
end |
@@ -222,20 +222,18 @@ module Agents |
||
222 | 222 |
status['text'] = status['text'].gsub(/</, "<").gsub(/>/, ">").gsub(/[\t\n\r]/, ' ') |
223 | 223 |
|
224 | 224 |
if status["retweeted_status"].present? && status["retweeted_status"].is_a?(Hash) |
225 |
- puts "Skipping retweet: #{status["text"]}" |
|
226 | 225 |
return |
227 | 226 |
elsif @recent_tweets.include?(status["id_str"]) |
228 |
- puts "Skipping duplicate tweet: #{status["text"]}" |
|
227 |
+ puts "(#{Time.now}) Skipping duplicate tweet: #{status["text"]}" |
|
229 | 228 |
return |
230 | 229 |
end |
231 | 230 |
|
232 | 231 |
@recent_tweets << status["id_str"] |
233 | 232 |
@recent_tweets.shift if @recent_tweets.length > DUPLICATE_DETECTION_LENGTH |
234 |
- puts status["text"] |
|
235 | 233 |
@filter_to_agent_map.keys.each do |filter| |
236 | 234 |
next unless (filter.downcase.split(SEPARATOR) - status["text"].downcase.split(SEPARATOR)).reject(&:empty?) == [] # Hacky McHackerson |
237 | 235 |
@filter_to_agent_map[filter].each do |agent| |
238 |
- puts " -> #{agent.name}" |
|
236 |
+ puts "(#{Time.now}) #{agent.name} received: #{status["text"]}" |
|
239 | 237 |
AgentRunner.with_connection do |
240 | 238 |
agent.process_tweet(filter, status) |
241 | 239 |
end |
@@ -264,8 +264,9 @@ module Agents |
||
264 | 264 |
error "Ignoring a non-HTTP url: #{url.inspect}" |
265 | 265 |
return |
266 | 266 |
end |
267 |
- log "Fetching #{url}" |
|
268 |
- response = faraday.get(url) |
|
267 |
+ uri = Utils.normalize_uri(url) |
|
268 |
+ log "Fetching #{uri}" |
|
269 |
+ response = faraday.get(uri) |
|
269 | 270 |
raise "Failed: #{response.inspect}" unless response.success? |
270 | 271 |
|
271 | 272 |
interpolation_context.stack { |
@@ -303,7 +304,7 @@ module Agents |
||
303 | 304 |
interpolated['extract'].keys.each do |name| |
304 | 305 |
result[name] = output[name][index] |
305 | 306 |
if name.to_s == 'url' |
306 |
- result[name] = (response.env[:url] + result[name]).to_s |
|
307 |
+ result[name] = (response.env[:url] + Utils.normalize_uri(result[name])).to_s |
|
307 | 308 |
end |
308 | 309 |
end |
309 | 310 |
|
@@ -101,7 +101,7 @@ class AgentRunner |
||
101 | 101 |
|
102 | 102 |
def restart_dead_workers |
103 | 103 |
@workers.each_pair do |id, worker| |
104 |
- if worker.thread && !worker.thread.alive? |
|
104 |
+ if !worker.restarting && worker.thread && !worker.thread.alive? |
|
105 | 105 |
puts "Restarting #{id.to_s}" unless Rails.env.test? |
106 | 106 |
@workers[id].run! |
107 | 107 |
end |
@@ -21,6 +21,18 @@ module Utils |
||
21 | 21 |
end |
22 | 22 |
end |
23 | 23 |
|
24 |
+ def self.normalize_uri(uri) |
|
25 |
+ begin |
|
26 |
+ URI(uri) |
|
27 |
+ rescue URI::Error |
|
28 |
+ URI(uri.to_s.gsub(/[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]]+/) { |unsafe| |
|
29 |
+ unsafe.bytes.each_with_object(String.new) { |uc, s| |
|
30 |
+ s << sprintf('%%%02X', uc) |
|
31 |
+ } |
|
32 |
+ }.force_encoding(Encoding::US_ASCII)) |
|
33 |
+ end |
|
34 |
+ end |
|
35 |
+ |
|
24 | 36 |
def self.interpolate_jsonpaths(value, data, options = {}) |
25 | 37 |
if options[:leading_dollarsign_is_jsonpath] && value[0] == '$' |
26 | 38 |
Utils.values_at(data, value).first.to_s |
@@ -46,7 +46,7 @@ describe LongRunnable do |
||
46 | 46 |
end |
47 | 47 |
|
48 | 48 |
after(:each) do |
49 |
- @worker.thread.terminate if @worker.thread |
|
49 |
+ @worker.thread.terminate if @worker.thread && !@skip_thread_terminate |
|
50 | 50 |
@scheduler.shutdown(:wait) |
51 | 51 |
end |
52 | 52 |
|
@@ -81,7 +81,7 @@ describe LongRunnable do |
||
81 | 81 |
|
82 | 82 |
context "#stop!" do |
83 | 83 |
it "terminates the thread" do |
84 |
- mock.proxy(@worker).terminate_thread |
|
84 |
+ mock.proxy(@worker).terminate_thread! |
|
85 | 85 |
@worker.stop! |
86 | 86 |
end |
87 | 87 |
|
@@ -91,6 +91,28 @@ describe LongRunnable do |
||
91 | 91 |
end |
92 | 92 |
end |
93 | 93 |
|
94 |
+ context "#terminate_thread!" do |
|
95 |
+ before do |
|
96 |
+ @skip_thread_terminate = true |
|
97 |
+ mock_thread = Object.new |
|
98 |
+ stub(@worker).thread { mock_thread } |
|
99 |
+ end |
|
100 |
+ |
|
101 |
+ it "terminates the thread" do |
|
102 |
+ mock(@worker.thread).terminate |
|
103 |
+ do_not_allow(@worker.thread).wakeup |
|
104 |
+ mock(@worker.thread).status { 'run' } |
|
105 |
+ @worker.terminate_thread! |
|
106 |
+ end |
|
107 |
+ |
|
108 |
+ it "wakes up sleeping threads after termination" do |
|
109 |
+ mock(@worker.thread).terminate |
|
110 |
+ mock(@worker.thread).wakeup |
|
111 |
+ mock(@worker.thread).status { 'sleep' } |
|
112 |
+ @worker.terminate_thread! |
|
113 |
+ end |
|
114 |
+ end |
|
115 |
+ |
|
94 | 116 |
context "#restart!" do |
95 | 117 |
it "stops, setups and starts the worker" do |
96 | 118 |
mock(@worker).stop! |
@@ -117,4 +139,4 @@ describe LongRunnable do |
||
117 | 139 |
end |
118 | 140 |
end |
119 | 141 |
end |
120 |
-end |
|
142 |
+end |
@@ -0,0 +1,17 @@ |
||
1 |
+<html> |
|
2 |
+ <head> |
|
3 |
+ <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> |
|
4 |
+ <title>test</title> |
|
5 |
+ </head> |
|
6 |
+ <body> |
|
7 |
+ <ul> |
|
8 |
+ <li><a href="http://google.com">google</a></li> |
|
9 |
+ <li><a href="https://www.google.ca/search?q=some query">broken</a></li> |
|
10 |
+ <li><a href="https://www.google.ca/search?q=some%20query">escaped</a></li> |
|
11 |
+ <li><a href="http://ko.wikipedia.org/wiki/위키백과:대문">unicode url</a></li> |
|
12 |
+ <li><a href="https://www.google.ca/search?q=위키백과:대문">unicode param</a></li> |
|
13 |
+ <li><a href="http://ko.wikipedia.org/wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8">percent encoded url</a></li> |
|
14 |
+ <li><a href="https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8">percent encoded param</a></li> |
|
15 |
+ </ul> |
|
16 |
+ </body> |
|
17 |
+</html> |
@@ -192,7 +192,7 @@ describe Agents::TwitterStreamAgent do |
||
192 | 192 |
|
193 | 193 |
context "#stop" do |
194 | 194 |
it "stops the thread" do |
195 |
- mock(@worker.thread).terminate |
|
195 |
+ mock(@worker).terminate_thread! |
|
196 | 196 |
@worker.stop |
197 | 197 |
end |
198 | 198 |
end |
@@ -221,20 +221,20 @@ describe Agents::TwitterStreamAgent do |
||
221 | 221 |
context "callback handling" do |
222 | 222 |
it "logs error messages" do |
223 | 223 |
stub_without(:on_error).on_error.yields('woups') |
224 |
- mock(STDERR).puts(" --> Twitter error: woups <--") |
|
224 |
+ mock(STDERR).puts(anything) { |text| expect(text).to match(/woups/) } |
|
225 | 225 |
@worker.send(:stream!, ['agent'], @agent) |
226 | 226 |
end |
227 | 227 |
|
228 | 228 |
it "stop when no data was received"do |
229 | 229 |
stub_without(:on_no_data).on_no_data.yields |
230 | 230 |
mock(@worker).restart! |
231 |
- mock(STDERR).puts(" --> Got no data for awhile; trying to reconnect.") |
|
231 |
+ mock(STDERR).puts(anything) |
|
232 | 232 |
@worker.send(:stream!, ['agent'], @agent) |
233 | 233 |
end |
234 | 234 |
|
235 | 235 |
it "sleeps for 60 seconds on_max_reconnects" do |
236 | 236 |
stub_without(:on_max_reconnects).on_max_reconnects.yields |
237 |
- mock(STDERR).puts(" --> Oops, tried too many times! <--") |
|
237 |
+ mock(STDERR).puts(anything) |
|
238 | 238 |
mock(@worker).sleep(60) |
239 | 239 |
mock(@worker).restart! |
240 | 240 |
@worker.send(:stream!, ['agent'], @agent) |
@@ -251,22 +251,21 @@ describe Agents::TwitterStreamAgent do |
||
251 | 251 |
|
252 | 252 |
context "#handle_status" do |
253 | 253 |
it "skips retweets" do |
254 |
- mock.instance_of(IO).puts('Skipping retweet: retweet') |
|
255 |
- @worker.send(:handle_status, {'text' => 'retweet', 'retweeted_status' => {one: true}}) |
|
254 |
+ @worker.send(:handle_status, {'text' => 'retweet', 'retweeted_status' => {one: true}, 'id_str' => '123' }) |
|
255 |
+ expect(@worker.instance_variable_get(:'@recent_tweets')).not_to include('123') |
|
256 | 256 |
end |
257 | 257 |
|
258 | 258 |
it "deduplicates tweets" do |
259 |
- mock.instance_of(IO).puts("dup") |
|
260 |
- @worker.send(:handle_status, {'text' => 'dup', 'id_str' => 1}) |
|
261 |
- mock.instance_of(IO).puts("Skipping duplicate tweet: dup") |
|
262 |
- @worker.send(:handle_status, {'text' => 'dup', 'id_str' => 1}) |
|
259 |
+ @worker.send(:handle_status, {'text' => 'dup', 'id_str' => '1'}) |
|
260 |
+ @worker.send(:handle_status, {'text' => 'dup', 'id_str' => '1'}) |
|
261 |
+ expect(@worker.instance_variable_get(:'@recent_tweets').select { |str| str == '1' }.length).to eq 1 |
|
263 | 262 |
end |
264 | 263 |
|
265 | 264 |
it "calls the agent to process the tweet" do |
266 |
- stub.instance_of(IO).puts |
|
267 | 265 |
mock(@mock_agent).name { 'mock' } |
268 | 266 |
mock(@mock_agent).process_tweet('agent', {'text' => 'agent'}) |
269 |
- @worker.send(:handle_status, {'text' => 'agent'}) |
|
267 |
+ @worker.send(:handle_status, {'text' => 'agent', 'id_str' => '123'}) |
|
268 |
+ expect(@worker.instance_variable_get(:'@recent_tweets')).to include('123') |
|
270 | 269 |
end |
271 | 270 |
end |
272 | 271 |
end |
@@ -911,4 +911,67 @@ fire: hot |
||
911 | 911 |
end |
912 | 912 |
end |
913 | 913 |
end |
914 |
+ |
|
915 |
+ describe "checking urls" do |
|
916 |
+ before do |
|
917 |
+ stub_request(:any, /example/). |
|
918 |
+ to_return(:body => File.read(Rails.root.join("spec/data_fixtures/urlTest.html")), :status => 200) |
|
919 |
+ @valid_options = { |
|
920 |
+ 'name' => "Url Test", |
|
921 |
+ 'expected_update_period_in_days' => "2", |
|
922 |
+ 'type' => "html", |
|
923 |
+ 'url' => "http://www.example.com", |
|
924 |
+ 'mode' => 'all', |
|
925 |
+ 'extract' => { |
|
926 |
+ 'url' => { 'css' => "a", 'value' => "@href" }, |
|
927 |
+ } |
|
928 |
+ } |
|
929 |
+ @checker = Agents::WebsiteAgent.new(:name => "ua", :options => @valid_options) |
|
930 |
+ @checker.user = users(:bob) |
|
931 |
+ @checker.save! |
|
932 |
+ end |
|
933 |
+ |
|
934 |
+ describe "#check" do |
|
935 |
+ before do |
|
936 |
+ expect { @checker.check }.to change { Event.count }.by(7) |
|
937 |
+ @events = Event.last(7) |
|
938 |
+ end |
|
939 |
+ |
|
940 |
+ it "should check hostname" do |
|
941 |
+ event = @events[0] |
|
942 |
+ expect(event.payload['url']).to eq("http://google.com") |
|
943 |
+ end |
|
944 |
+ |
|
945 |
+ it "should check unescaped query" do |
|
946 |
+ event = @events[1] |
|
947 |
+ expect(event.payload['url']).to eq("https://www.google.ca/search?q=some%20query") |
|
948 |
+ end |
|
949 |
+ |
|
950 |
+ it "should check properly escaped query" do |
|
951 |
+ event = @events[2] |
|
952 |
+ expect(event.payload['url']).to eq("https://www.google.ca/search?q=some%20query") |
|
953 |
+ end |
|
954 |
+ |
|
955 |
+ it "should check unescaped unicode url" do |
|
956 |
+ event = @events[3] |
|
957 |
+ expect(event.payload['url']).to eq("http://ko.wikipedia.org/wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8") |
|
958 |
+ end |
|
959 |
+ |
|
960 |
+ it "should check unescaped unicode query" do |
|
961 |
+ event = @events[4] |
|
962 |
+ expect(event.payload['url']).to eq("https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8") |
|
963 |
+ end |
|
964 |
+ |
|
965 |
+ it "should check properly escaped unicode url" do |
|
966 |
+ event = @events[5] |
|
967 |
+ expect(event.payload['url']).to eq("http://ko.wikipedia.org/wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8") |
|
968 |
+ end |
|
969 |
+ |
|
970 |
+ it "should check properly escaped unicode query" do |
|
971 |
+ event = @events[6] |
|
972 |
+ expect(event.payload['url']).to eq("https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8") |
|
973 |
+ end |
|
974 |
+ |
|
975 |
+ end |
|
976 |
+ end |
|
914 | 977 |
end |