@@ -264,8 +264,9 @@ module Agents |
||
264 | 264 |
error "Ignoring a non-HTTP url: #{url.inspect}" |
265 | 265 |
return |
266 | 266 |
end |
267 |
- log "Fetching #{url}" |
|
268 |
- response = faraday.get(url) |
|
267 |
+ uri = Utils.normalize_uri(url) |
|
268 |
+ log "Fetching #{uri}" |
|
269 |
+ response = faraday.get(uri) |
|
269 | 270 |
raise "Failed: #{response.inspect}" unless response.success? |
270 | 271 |
|
271 | 272 |
interpolation_context.stack { |
@@ -303,7 +304,7 @@ module Agents |
||
303 | 304 |
interpolated['extract'].keys.each do |name| |
304 | 305 |
result[name] = output[name][index] |
305 | 306 |
if name.to_s == 'url' |
306 |
- result[name] = (response.env[:url] + result[name]).to_s |
|
307 |
+ result[name] = (response.env[:url] + Utils.normalize_uri(result[name])).to_s |
|
307 | 308 |
end |
308 | 309 |
end |
309 | 310 |
|
@@ -21,6 +21,18 @@ module Utils |
||
21 | 21 |
end |
22 | 22 |
end |
23 | 23 |
|
24 |
+ def self.normalize_uri(uri) |
|
25 |
+ begin |
|
26 |
+ URI(uri) |
|
27 |
+ rescue URI::Error |
|
28 |
+ URI(uri.to_s.gsub(/[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]]+/) { |unsafe| |
|
29 |
+ unsafe.bytes.each_with_object(String.new) { |uc, s| |
|
30 |
+ s << sprintf('%%%02X', uc) |
|
31 |
+ } |
|
32 |
+ }.force_encoding(Encoding::US_ASCII)) |
|
33 |
+ end |
|
34 |
+ end |
|
35 |
+ |
|
24 | 36 |
def self.interpolate_jsonpaths(value, data, options = {}) |
25 | 37 |
if options[:leading_dollarsign_is_jsonpath] && value[0] == '$' |
26 | 38 |
Utils.values_at(data, value).first.to_s |
@@ -0,0 +1,17 @@ |
||
1 |
+<html> |
|
2 |
+ <head> |
|
3 |
+ <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> |
|
4 |
+ <title>test</title> |
|
5 |
+ </head> |
|
6 |
+ <body> |
|
7 |
+ <ul> |
|
8 |
+ <li><a href="http://google.com">google</a></li> |
|
9 |
+ <li><a href="https://www.google.ca/search?q=some query">broken</a></li> |
|
10 |
+ <li><a href="https://www.google.ca/search?q=some%20query">escaped</a></li> |
|
11 |
+ <li><a href="http://ko.wikipedia.org/wiki/위키백과:대문">unicode url</a></li> |
|
12 |
+ <li><a href="https://www.google.ca/search?q=위키백과:대문">unicode param</a></li> |
|
13 |
+ <li><a href="http://ko.wikipedia.org/wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8">percent encoded url</a></li> |
|
14 |
+ <li><a href="https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8">percent encoded param</a></li> |
|
15 |
+ </ul> |
|
16 |
+ </body> |
|
17 |
+</html> |
@@ -911,4 +911,67 @@ fire: hot |
||
911 | 911 |
end |
912 | 912 |
end |
913 | 913 |
end |
914 |
+ |
|
915 |
+ describe "checking urls" do |
|
916 |
+ before do |
|
917 |
+ stub_request(:any, /example/). |
|
918 |
+ to_return(:body => File.read(Rails.root.join("spec/data_fixtures/urlTest.html")), :status => 200) |
|
919 |
+ @valid_options = { |
|
920 |
+ 'name' => "Url Test", |
|
921 |
+ 'expected_update_period_in_days' => "2", |
|
922 |
+ 'type' => "html", |
|
923 |
+ 'url' => "http://www.example.com", |
|
924 |
+ 'mode' => 'all', |
|
925 |
+ 'extract' => { |
|
926 |
+ 'url' => { 'css' => "a", 'value' => "@href" }, |
|
927 |
+ } |
|
928 |
+ } |
|
929 |
+ @checker = Agents::WebsiteAgent.new(:name => "ua", :options => @valid_options) |
|
930 |
+ @checker.user = users(:bob) |
|
931 |
+ @checker.save! |
|
932 |
+ end |
|
933 |
+ |
|
934 |
+ describe "#check" do |
|
935 |
+ before do |
|
936 |
+ expect { @checker.check }.to change { Event.count }.by(7) |
|
937 |
+ @events = Event.last(7) |
|
938 |
+ end |
|
939 |
+ |
|
940 |
+ it "should check hostname" do |
|
941 |
+ event = @events[0] |
|
942 |
+ expect(event.payload['url']).to eq("http://google.com") |
|
943 |
+ end |
|
944 |
+ |
|
945 |
+ it "should check unescaped query" do |
|
946 |
+ event = @events[1] |
|
947 |
+ expect(event.payload['url']).to eq("https://www.google.ca/search?q=some%20query") |
|
948 |
+ end |
|
949 |
+ |
|
950 |
+ it "should check properly escaped query" do |
|
951 |
+ event = @events[2] |
|
952 |
+ expect(event.payload['url']).to eq("https://www.google.ca/search?q=some%20query") |
|
953 |
+ end |
|
954 |
+ |
|
955 |
+ it "should check unescaped unicode url" do |
|
956 |
+ event = @events[3] |
|
957 |
+ expect(event.payload['url']).to eq("http://ko.wikipedia.org/wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8") |
|
958 |
+ end |
|
959 |
+ |
|
960 |
+ it "should check unescaped unicode query" do |
|
961 |
+ event = @events[4] |
|
962 |
+ expect(event.payload['url']).to eq("https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8") |
|
963 |
+ end |
|
964 |
+ |
|
965 |
+ it "should check properly escaped unicode url" do |
|
966 |
+ event = @events[5] |
|
967 |
+ expect(event.payload['url']).to eq("http://ko.wikipedia.org/wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8") |
|
968 |
+ end |
|
969 |
+ |
|
970 |
+ it "should check properly escaped unicode query" do |
|
971 |
+ event = @events[6] |
|
972 |
+ expect(event.payload['url']).to eq("https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8") |
|
973 |
+ end |
|
974 |
+ |
|
975 |
+ end |
|
976 |
+ end |
|
914 | 977 |
end |