Fix transcoding error when an unknown charset is found

Trying to convert from ASCII-8BIT to UTF-8 almost always fails when a
string contains a 8-bit character, so in this case the second argument
of encode!() may not be nil but UTF-8.

Akinori MUSHA 9 lat temu
rodzic
commit
8e9013d9ea
2 zmienionych plików z 66 dodań i 14 usunięć
  1. 2 2
      app/concerns/web_request_concern.rb
  2. 64 12
      spec/models/agents/website_agent_spec.rb

+ 2 - 2
app/concerns/web_request_concern.rb

@@ -39,7 +39,7 @@ module WebRequestConcern
39 39
           # detection, so we do that.
40 40
           case env[:response_headers][:content_type]
41 41
           when /;\s*charset\s*=\s*([^()<>@,;:\\\"\/\[\]?={}\s]+)/i
42
-            encoding = Encoding.find($1) rescue nil
42
+            encoding = Encoding.find($1) rescue @default_encoding
43 43
           when /\A\s*(?:text\/[^\s;]+|application\/(?:[^\s;]+\+)?(?:xml|json))\s*(?:;|\z)/i
44 44
             encoding = @default_encoding
45 45
           else
@@ -47,7 +47,7 @@ module WebRequestConcern
47 47
             next
48 48
           end
49 49
         end
50
-        body.encode!(Encoding::UTF_8, encoding) unless body.encoding == Encoding::UTF_8
50
+        body.encode!(Encoding::UTF_8, encoding)
51 51
       end
52 52
     end
53 53
   end

+ 64 - 12
spec/models/agents/website_agent_spec.rb

@@ -262,11 +262,11 @@ describe Agents::WebsiteAgent do
262 262
     describe 'encoding' do
263 263
       it 'should be forced with force_encoding option' do
264 264
         huginn = "\u{601d}\u{8003}"
265
-        stub_request(:any, /no-encoding/).to_return(:body => {
266
-            :value => huginn,
267
-          }.to_json.encode(Encoding::EUC_JP), :headers => {
265
+        stub_request(:any, /no-encoding/).to_return(body: {
266
+            value: huginn,
267
+          }.to_json.encode(Encoding::EUC_JP).b, headers: {
268 268
             'Content-Type' => 'application/json',
269
-          }, :status => 200)
269
+          }, status: 200)
270 270
         site = {
271 271
           'name' => "Some JSON Response",
272 272
           'expected_update_period_in_days' => "2",
@@ -278,22 +278,22 @@ describe Agents::WebsiteAgent do
278 278
           },
279 279
           'force_encoding' => 'EUC-JP',
280 280
         }
281
-        checker = Agents::WebsiteAgent.new(:name => "No Encoding Site", :options => site)
281
+        checker = Agents::WebsiteAgent.new(name: "No Encoding Site", options: site)
282 282
         checker.user = users(:bob)
283 283
         checker.save!
284 284
 
285
-        checker.check
285
+        expect { checker.check }.to change { Event.count }.by(1)
286 286
         event = Event.last
287 287
         expect(event.payload['value']).to eq(huginn)
288 288
       end
289 289
 
290 290
       it 'should be overridden with force_encoding option' do
291 291
         huginn = "\u{601d}\u{8003}"
292
-        stub_request(:any, /wrong-encoding/).to_return(:body => {
293
-            :value => huginn,
294
-          }.to_json.encode(Encoding::EUC_JP), :headers => {
292
+        stub_request(:any, /wrong-encoding/).to_return(body: {
293
+            value: huginn,
294
+          }.to_json.encode(Encoding::EUC_JP).b, headers: {
295 295
             'Content-Type' => 'application/json; UTF-8',
296
-          }, :status => 200)
296
+          }, status: 200)
297 297
         site = {
298 298
           'name' => "Some JSON Response",
299 299
           'expected_update_period_in_days' => "2",
@@ -305,11 +305,63 @@ describe Agents::WebsiteAgent do
305 305
           },
306 306
           'force_encoding' => 'EUC-JP',
307 307
         }
308
-        checker = Agents::WebsiteAgent.new(:name => "Wrong Encoding Site", :options => site)
308
+        checker = Agents::WebsiteAgent.new(name: "Wrong Encoding Site", options: site)
309 309
         checker.user = users(:bob)
310 310
         checker.save!
311 311
 
312
-        checker.check
312
+        expect { checker.check }.to change { Event.count }.by(1)
313
+        event = Event.last
314
+        expect(event.payload['value']).to eq(huginn)
315
+      end
316
+
317
+      it 'should be determined by charset in Content-Type' do
318
+        huginn = "\u{601d}\u{8003}"
319
+        stub_request(:any, /charset-euc-jp/).to_return(body: {
320
+            value: huginn,
321
+          }.to_json.encode(Encoding::EUC_JP), headers: {
322
+            'Content-Type' => 'application/json; charset=EUC-JP',
323
+          }, status: 200)
324
+        site = {
325
+          'name' => "Some JSON Response",
326
+          'expected_update_period_in_days' => "2",
327
+          'type' => "json",
328
+          'url' => "http://charset-euc-jp.example.com",
329
+          'mode' => 'on_change',
330
+          'extract' => {
331
+            'value' => { 'path' => 'value' },
332
+          },
333
+        }
334
+        checker = Agents::WebsiteAgent.new(name: "Charset reader", options: site)
335
+        checker.user = users(:bob)
336
+        checker.save!
337
+
338
+        expect { checker.check }.to change { Event.count }.by(1)
339
+        event = Event.last
340
+        expect(event.payload['value']).to eq(huginn)
341
+      end
342
+
343
+      it 'should default to UTF-8 when unknown charset is found' do
344
+        huginn = "\u{601d}\u{8003}"
345
+        stub_request(:any, /charset-unknown/).to_return(body: {
346
+            value: huginn,
347
+          }.to_json.b, headers: {
348
+            'Content-Type' => 'application/json; charset=unicode',
349
+          }, status: 200)
350
+        site = {
351
+          'name' => "Some JSON Response",
352
+          'expected_update_period_in_days' => "2",
353
+          'type' => "json",
354
+          'url' => "http://charset-unknown.example.com",
355
+          'mode' => 'on_change',
356
+          'extract' => {
357
+            'value' => { 'path' => 'value' },
358
+          },
359
+        }
360
+        checker = Agents::WebsiteAgent.new(name: "Charset reader", options: site)
361
+        checker.user = users(:bob)
362
+        checker.save!
363
+
364
+        expect { checker.check }.to change { Event.count }.by(1)
313 365
         event = Event.last
314 366
         expect(event.payload['value']).to eq(huginn)
315 367
       end