Kouhei Sutou 2019-02-28 14:54:41 +0900 (Thu, 28 Feb 2019) Revision: c6b2e6e78e98b8ef585e005de2aae7fadd9be23d https://github.com/ranguba/chupa-text/commit/c6b2e6e78e98b8ef585e005de2aae7fadd9be23d Message: Improve binary data detection Modified files: lib/chupa-text/data.rb test/test-data.rb Modified: lib/chupa-text/data.rb (+4 -1) =================================================================== --- lib/chupa-text/data.rb 2019-02-28 14:43:15 +0900 (8f561af) +++ lib/chupa-text/data.rb 2019-02-28 14:54:41 +0900 (15ba6f7) @@ -203,7 +203,10 @@ module ChupaText def guess_mime_type_from_body mime_type = nil change_encoding(body, "UTF-8") do |utf8_body| - mime_type = "text/plain" if utf8_body.valid_encoding? + return nil unless utf8_body.valid_encoding? + n_null_characters = utf8_body.count("\u0000") + return nil if n_null_characters > (utf8_body.bytesize * 0.01) + mime_type = "text/plain" end mime_type end Modified: test/test-data.rb (+7 -3) =================================================================== --- test/test-data.rb 2019-02-28 14:43:15 +0900 (9f0f62a) +++ test/test-data.rb 2019-02-28 14:54:41 +0900 (5b71dbb) @@ -1,4 +1,4 @@ -# Copyright (C) 2013 Kouhei Sutou <kou****@clear*****> +# Copyright (C) 2013-2019 Kouhei Sutou <kou****@clear*****> # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public @@ -44,11 +44,15 @@ class TestData < Test::Unit::TestCase sub_test_case("body") do def test_txt - body = "Hello" - body.force_encoding("ASCII-8BIT") + body = "Hello".b assert_equal("text/plain", guess(body)) end + def test_utf8_valid_binary + body = "GROONGA:IO:00001@\0\0\0\0\0\0\0\0\0\0".b + assert_nil(guess(body)) + end + private def guess(body) @data.body = body -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190228/09b0e2c7/attachment-0001.html>