[Groonga-commit] ranguba/chupa-text at 64482ea [master] Add support for plain text encoding conversion

Back to archive index
Kouhei Sutou null+****@clear*****
Thu Feb 28 12:43:23 JST 2019


Kouhei Sutou	2019-02-28 12:43:23 +0900 (Thu, 28 Feb 2019)

  Revision: 64482eab0f7143a98f074c8a200b07ba28746016
  https://github.com/ranguba/chupa-text/commit/64482eab0f7143a98f074c8a200b07ba28746016

  Message:
    Add support for plain text encoding conversion

  Modified files:
    lib/chupa-text/extractor.rb
    test/test-extractor.rb

  Modified: lib/chupa-text/extractor.rb (+34 -2)
===================================================================
--- lib/chupa-text/extractor.rb    2019-02-28 12:21:24 +0900 (d564e16)
+++ lib/chupa-text/extractor.rb    2019-02-28 12:43:23 +0900 (403e90a)
@@ -68,11 +68,13 @@ module ChupaText
         if decomposer.nil?
           if target.text_plain?
             debug {"#{log_tag}[extract][text-plain]"}
-            yield(target)
+            yield(ensure_utf8_body_data(target))
             next
           else
             debug {"#{log_tag}[extract][decomposer] not found"}
-            yield(target) if target.text?
+            if target.text?
+              yield(ensure_utf8_body_data(target))
+            end
             next
           end
         end
@@ -98,6 +100,36 @@ module ChupaText
       end
     end
 
+    # Returns +data+ itself when its body is nil, UTF-8 or ASCII-only binary.
+    # Otherwise tries UTF-8, EUC-JP and Windows-31J in this order and returns a
+    # TextData with the body transcoded to UTF-8 (bad bytes are replaced).
+    def ensure_utf8_body_data(data)
+      body = data.body
+      return data if body.nil?
+
+      encoding = body.encoding
+      case encoding
+      when Encoding::UTF_8
+        return data
+      when Encoding::ASCII_8BIT
+        return data if body.ascii_only?
+      end
+
+      candidates = [Encoding::UTF_8, Encoding::EUC_JP, Encoding::Windows_31J]
+      candidates.each do |candidate|
+        body.force_encoding(candidate)
+        if body.valid_encoding?
+          utf8_body = body.encode(Encoding::UTF_8,
+                                  invalid: :replace,
+                                  undef: :replace)
+          return TextData.new(utf8_body, source_data: data)
+        end
+      end
+      # No candidate fit: put the original encoding tag back (String has no #encoding=).
+      body.force_encoding(encoding)
+      data
+    end
+
     def find_decomposer(data)
       candidates = []
       @decomposers.each do |decomposer|

  Modified: test/test-extractor.rb (+24 -1)
===================================================================
--- test/test-extractor.rb    2019-02-28 12:21:24 +0900 (02c81fb)
+++ test/test-extractor.rb    2019-02-28 12:43:23 +0900 (713b89f)
@@ -1,4 +1,4 @@
-# Copyright (C) 2013  Kouhei Sutou <kou****@clear*****>
+# Copyright (C) 2013-2019  Kouhei Sutou <kou****@clear*****>
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -121,5 +121,28 @@ class TestExtractor < Test::Unit::TestCase
         assert_equal(["Hello", "Hello"], extract(data))
       end
     end
+
+    sub_test_case("body") do # text/plain bodies in different source encodings
+      def test_utf8 # a UTF-8 literal body is expected to be extracted unchanged
+        data = ChupaText::Data.new
+        data.mime_type = "text/plain"
+        data.body = "こんにちは"
+        assert_equal(["こんにちは"], extract(data))
+      end
+
+      def test_cp932 # a body encoded as cp932 is expected to equal the UTF-8 literal
+        data = ChupaText::Data.new
+        data.mime_type = "text/plain"
+        data.body = "こんにちは".encode("cp932")
+        assert_equal(["こんにちは"], extract(data))
+      end
+
+      def test_euc_jp # a body encoded as euc-jp is expected to equal the UTF-8 literal
+        data = ChupaText::Data.new
+        data.mime_type = "text/plain"
+        data.body = "こんにちは".encode("euc-jp")
+        assert_equal(["こんにちは"], extract(data))
+      end
+    end
   end
 end
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190228/6ea223ea/attachment-0001.html>


More information about the Groonga-commit mailing list
Back to archive index