Merge pull request #1125 from cantino/website_agent_normalize_uri

Introduce Utils.normalize_uri and use it in WebsiteAgent

Akinori MUSHA 9 years ago
parent
commit
bde6a7269d

+ 4 - 3
app/models/agents/website_agent.rb

@@ -264,8 +264,9 @@ module Agents
264 264
         error "Ignoring a non-HTTP url: #{url.inspect}"
265 265
         return
266 266
       end
267
-      log "Fetching #{url}"
268
-      response = faraday.get(url)
267
+      uri = Utils.normalize_uri(url)
268
+      log "Fetching #{uri}"
269
+      response = faraday.get(uri)
269 270
       raise "Failed: #{response.inspect}" unless response.success?
270 271
 
271 272
       interpolation_context.stack {
@@ -303,7 +304,7 @@ module Agents
303 304
           interpolated['extract'].keys.each do |name|
304 305
             result[name] = output[name][index]
305 306
             if name.to_s == 'url'
306
-              result[name] = (response.env[:url] + result[name]).to_s
307
+              result[name] = (response.env[:url] + Utils.normalize_uri(result[name])).to_s
307 308
             end
308 309
           end
309 310
 

+ 12 - 0
lib/utils.rb

@@ -21,6 +21,18 @@ module Utils
21 21
     end
22 22
   end
23 23
 
24
# Coerces +uri+ into a URI object, repairing it when it is not strictly valid.
#
# The value is first handed to Kernel#URI unchanged.  Only when that raises a
# URI::Error (e.g. the string contains spaces or non-ASCII characters) is the
# string form percent-encoded and parsed again.
#
# During the repair pass every byte of a character outside the set
# -_.!~*'()a-zA-Z0-9;/?:@&=+$,[] is replaced by its %XX form.  A "%" that
# already starts a well-formed %XX escape is kept as is, so partially escaped
# input such as "a b%20c" becomes "a%20b%20c" rather than "a%20b%2520c";
# a "%" that is not followed by two hex digits is encoded as "%25".
#
# @param uri [String, URI, #to_s] the address to normalize
# @return [URI] the parsed (and, if necessary, escaped) URI
# @raise [URI::Error] if the value still cannot be parsed after escaping
def self.normalize_uri(uri)
  URI(uri)
rescue URI::Error
  # First alternative: a lone "%" that is not part of an existing escape.
  # Second alternative: a run of unsafe characters ("%" itself is excluded
  # here so that valid escapes are never touched).
  unsafe_pattern = /%(?!\h{2})|[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]%]+/

  escaped = uri.to_s.gsub(unsafe_pattern) do |unsafe|
    # Encode byte-wise so multi-byte characters yield one %XX per byte.
    unsafe.bytes.each_with_object(String.new) do |byte, buffer|
      buffer << format('%%%02X', byte)
    end
  end

  # gsub returned a fresh string that is now pure ASCII; tag it accordingly.
  URI(escaped.force_encoding(Encoding::US_ASCII))
end
35
+
24 36
   def self.interpolate_jsonpaths(value, data, options = {})
25 37
     if options[:leading_dollarsign_is_jsonpath] && value[0] == '$'
26 38
       Utils.values_at(data, value).first.to_s

+ 17 - 0
spec/data_fixtures/urlTest.html

@@ -0,0 +1,17 @@
1
+<html>
2
+    <head>
3
+        <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
4
+        <title>test</title>
5
+    </head>
6
+    <body>
7
+        <ul>
8
+            <li><a href="http://google.com">google</a></li>
9
+            <li><a href="https://www.google.ca/search?q=some query">broken</a></li>
10
+            <li><a href="https://www.google.ca/search?q=some%20query">escaped</a></li>
11
+            <li><a href="http://ko.wikipedia.org/wiki/위키백과:대문">unicode url</a></li>
12
+            <li><a href="https://www.google.ca/search?q=위키백과:대문">unicode param</a></li>
13
+            <li><a href="http://ko.wikipedia.org/wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8">percent encoded url</a></li>
14
+            <li><a href="https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8">percent encoded param</a></li>
15
+        </ul>
16
+    </body>
17
+</html>

+ 63 - 0
spec/models/agents/website_agent_spec.rb

@@ -911,4 +911,67 @@ fire: hot
911 911
       end
912 912
     end
913 913
   end
914
+
915
# Exercises WebsiteAgent#check against a fixture page holding seven links
# (plain, unescaped, pre-escaped, and non-ASCII variants) and verifies the
# 'url' payload of each resulting event.
describe "checking urls" do
  before do
    fixture_html = File.read(Rails.root.join("spec/data_fixtures/urlTest.html"))
    stub_request(:any, /example/).to_return(:body => fixture_html, :status => 200)

    @valid_options = {
      'name' => "Url Test",
      'expected_update_period_in_days' => "2",
      'type' => "html",
      'url' => "http://www.example.com",
      'mode' => 'all',
      'extract' => {
        'url' => { 'css' => "a", 'value' => "@href" },
      }
    }

    @checker = Agents::WebsiteAgent.new(:name => "ua", :options => @valid_options)
    @checker.user = users(:bob)
    @checker.save!
  end

  describe "#check" do
    before do
      # One event is expected per link in the fixture.
      expect { @checker.check }.to change { Event.count }.by(7)
      @events = Event.last(7)
    end

    # [example description, expected payload url], listed in the order the
    # links appear in the fixture (and therefore the order of @events).
    [
      ["should check hostname",
       "http://google.com"],
      ["should check unescaped query",
       "https://www.google.ca/search?q=some%20query"],
      ["should check properly escaped query",
       "https://www.google.ca/search?q=some%20query"],
      ["should check unescaped unicode url",
       "http://ko.wikipedia.org/wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8"],
      ["should check unescaped unicode query",
       "https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8"],
      ["should check properly escaped unicode url",
       "http://ko.wikipedia.org/wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8"],
      ["should check properly escaped unicode query",
       "https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8"],
    ].each_with_index do |(description, expected_url), position|
      it description do
        event = @events[position]
        expect(event.payload['url']).to eq(expected_url)
      end
    end
  end
end
914 977
 end