[Groonga-commit] ranguba/chupa-text at a3b273e [master] Add open-document-text decomposer

Back to archive index
Kouhei Sutou null+****@clear*****
Tue Feb 26 15:31:51 JST 2019


Kouhei Sutou	2019-02-26 15:31:51 +0900 (Tue, 26 Feb 2019)

  Revision: a3b273e9922f756d58da7d157234e414fcf6d7a7
  https://github.com/ranguba/chupa-text/commit/a3b273e9922f756d58da7d157234e414fcf6d7a7

  Message:
    Add open-document-text decomposer

  Added files:
    lib/chupa-text/decomposers/open-document-text.rb
  Modified files:
    lib/chupa-text/decomposers/open-document-presentation.rb
    lib/chupa-text/decomposers/open-document-spreadsheet.rb
    lib/chupa-text/decomposers/open-document.rb
    test/decomposers/test-open-document-text.rb

  Modified: lib/chupa-text/decomposers/open-document-presentation.rb (+12 -42)
===================================================================
--- lib/chupa-text/decomposers/open-document-presentation.rb    2019-02-26 15:04:24 +0900 (67938cd)
+++ lib/chupa-text/decomposers/open-document-presentation.rb    2019-02-26 15:31:51 +0900 (e6176a9)
@@ -27,58 +27,28 @@ module ChupaText
         @mime_type = "application/vnd.oasis.opendocument.presentation"
       end
 
-      def target?(data)
-        data.extension == @extension or
-          data.mime_type == @mime_type
+      private
+      def process_content(entry, context, &block)
+        context[:slides] = []
+        listener = SlidesListener.new(context[:slides])
+        parse(entry.file_data, listener)
       end
 
-      def target_score(data)
-        if target?(data)
-          -1
-        else
-          nil
+      def finish_decompose(context, &block)
+        metadata = TextData.new("", source_data: context[:data])
+        context[:attributes].each do |name, value|
+          metadata[name] = value
         end
-      end
+        yield(metadata)
 
-      def decompose(data)
-        slides = []
-        data.open do |input|
-          Archive::Zip.open(input) do |zip|
-            zip.each do |entry|
-              next unless entry.file?
-              case entry.zip_path
-              when "content.xml"
-                listener = SlidesListener.new(slides)
-                parse(entry.file_data, listener)
-              when "meta.xml"
-                attributes = {}
-                listener = AttributesListener.new(attributes)
-                parse(entry.file_data, listener)
-                metadata = TextData.new("", source_data: data)
-                attributes.each do |name, value|
-                  metadata[name] = value
-                end
-                yield(metadata)
-              end
-            end
-          end
-        end
-        slides.each_with_index do |slide, i|
+        (context[:slides] || []).each_with_index do |slide, i|
           text = slide[:text]
-          text_data = TextData.new(text, source_data: data)
+          text_data = TextData.new(text, source_data: context[:data])
           text_data["index"] = i
           yield(text_data)
         end
       end
 
-      private
-      def parse(io, listener)
-        source = REXML::Source.new(io.read)
-        parser = REXML::Parsers::SAX2Parser.new(source)
-        parser.listen(listener)
-        parser.parse
-      end
-
       class SlidesListener
         include REXML::SAX2Listener
 

  Modified: lib/chupa-text/decomposers/open-document-spreadsheet.rb (+12 -42)
===================================================================
--- lib/chupa-text/decomposers/open-document-spreadsheet.rb    2019-02-26 15:04:24 +0900 (a4c7eff)
+++ lib/chupa-text/decomposers/open-document-spreadsheet.rb    2019-02-26 15:31:51 +0900 (6f218d9)
@@ -27,45 +27,23 @@ module ChupaText
         @mime_type = "application/vnd.oasis.opendocument.spreadsheet"
       end
 
-      def target?(data)
-        data.extension == @extension or
-          data.mime_type == @mime_type
+      private
+      def process_content(entry, context, &block)
+        context[:sheets] = []
+        listener = SheetsListener.new(context[:sheets])
+        parse(entry.file_data, listener)
       end
 
-      def target_score(data)
-        if target?(data)
-          -1
-        else
-          nil
+      def finish_decompose(context, &block)
+        metadata = TextData.new("", source_data: context[:data])
+        context[:attributes].each do |name, value|
+          metadata[name] = value
         end
-      end
+        yield(metadata)
 
-      def decompose(data)
-        sheets = []
-        data.open do |input|
-          Archive::Zip.open(input) do |zip|
-            zip.each do |entry|
-              next unless entry.file?
-              case entry.zip_path
-              when "content.xml"
-                listener = SheetsListener.new(sheets)
-                parse(entry.file_data, listener)
-              when "meta.xml"
-                attributes = {}
-                listener = AttributesListener.new(attributes)
-                parse(entry.file_data, listener)
-                metadata = TextData.new("", source_data: data)
-                attributes.each do |name, value|
-                  metadata[name] = value
-                end
-                yield(metadata)
-              end
-            end
-          end
-        end
-        sheets.each_with_index do |sheet, i|
+        (context[:sheets] || []).each_with_index do |sheet, i|
           text = sheet[:text]
-          text_data = TextData.new(text, source_data: data)
+          text_data = TextData.new(text, source_data: context[:data])
           text_data["index"] = i
           name = sheet[:name]
           text_data["name"] = name if name
@@ -73,14 +51,6 @@ module ChupaText
         end
       end
 
-      private
-      def parse(io, listener)
-        source = REXML::Source.new(io.read)
-        parser = REXML::Parsers::SAX2Parser.new(source)
-        parser.listen(listener)
-        parser.parse
-      end
-
       class SheetsListener
         include REXML::SAX2Listener
 

  Added: lib/chupa-text/decomposers/open-document-text.rb (+89 -0) 100644
===================================================================
--- /dev/null
+++ lib/chupa-text/decomposers/open-document-text.rb    2019-02-26 15:31:51 +0900 (4984313)
@@ -0,0 +1,89 @@
+# Copyright (C) 2019  Kouhei Sutou <kou****@clear*****>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+require "chupa-text/decomposers/open-document"
+
+module ChupaText
+  module Decomposers
+    class OpenDocumentText < OpenDocument
+      registry.register("open-document-text", self)
+
+      def initialize(options={})
+        super
+        @extension = "odt"
+        @mime_type = "application/vnd.oasis.opendocument.text"
+      end
+
+      private
+      def process_content(entry, context, &block)
+        context[:text] = ""
+        listener = TextListener.new(context[:text])
+        parse(entry.file_data, listener)
+      end
+
+      def finish_decompose(context, &block)
+        text_data = TextData.new(context[:text] || "",
+                                 source_data: context[:data])
+        context[:attributes].each do |name, value|
+          text_data[name] = value
+        end
+        yield(text_data)
+      end
+
+      class TextListener
+        include REXML::SAX2Listener
+
+        TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
+        def initialize(output)
+          @output = output
+          @in_p = false
+        end
+
+        def start_element(uri, local_name, qname, attributes)
+          return unless uri == TEXT_URI
+          case local_name
+          when "p"
+            @in_p = true
+          end
+        end
+
+        def end_element(uri, local_name, qname)
+          @in_p = false
+
+          return unless uri == TEXT_URI
+          case local_name
+          when "p"
+            @output << "\n"
+          end
+        end
+
+        def characters(text)
+          add_text(text)
+        end
+
+        def cdata(content)
+          add_text(content)
+        end
+
+        private
+        def add_text(text)
+          return unless @in_p
+          @output << CGI.unescapeHTML(text)
+        end
+      end
+    end
+  end
+end

  Modified: lib/chupa-text/decomposers/open-document.rb (+10 -62)
===================================================================
--- lib/chupa-text/decomposers/open-document.rb    2019-02-26 15:04:24 +0900 (635c800)
+++ lib/chupa-text/decomposers/open-document.rb    2019-02-26 15:31:51 +0900 (8aa5ddb)
@@ -23,17 +23,9 @@ require "archive/zip"
 module ChupaText
   module Decomposers
     class OpenDocument < Decomposer
-      registry.register("open-document", self)
-
-      EXTENSIONS = [
-        "odt",
-      ]
-      MIME_TYPES = [
-        "application/vnd.oasis.opendocument.text",
-      ]
       def target?(data)
-        EXTENSIONS.include?(data.extension) or
-          MIME_TYPES.include?(data.mime_type)
+        data.extension == @extension or
+          data.mime_type == @mime_type
       end
 
       def target_score(data)
@@ -44,9 +36,9 @@ module ChupaText
         end
       end
 
-      def decompose(data)
+      def decompose(data, &block)
         context = {
-          text: "",
+          data: data,
           attributes: {},
         }
         data.open do |input|
@@ -55,21 +47,14 @@ module ChupaText
               next unless entry.file?
               case entry.zip_path
               when "content.xml"
-                listener = TextListener.new(context[:text])
-                parse(entry.file_data, listener)
+                process_content(entry, context, &block)
               when "meta.xml"
-                listener = AttributesListener.new(context[:attributes])
-                parse(entry.file_data, listener)
+                process_meta(entry, context, &block)
               end
             end
           end
         end
-        text = context[:text]
-        text_data = TextData.new(text, source_data: data)
-        context[:attributes].each do |name, value|
-          text_data[name] = value
-        end
-        yield(text_data)
+        finish_decompose(context, &block)
       end
 
       private
@@ -80,46 +65,9 @@ module ChupaText
         parser.parse
       end
 
-      class TextListener
-        include REXML::SAX2Listener
-
-        TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
-        def initialize(output)
-          @output = output
-          @in_p = false
-        end
-
-        def start_element(uri, local_name, qname, attributes)
-          return unless uri == TEXT_URI
-          case local_name
-          when "p"
-            @in_p = true
-          end
-        end
-
-        def end_element(uri, local_name, qname)
-          @in_p = false
-
-          return unless uri == TEXT_URI
-          case local_name
-          when "p"
-            @output << "\n"
-          end
-        end
-
-        def characters(text)
-          add_text(text)
-        end
-
-        def cdata(content)
-          add_text(content)
-        end
-
-        private
-        def add_text(text)
-          return unless @in_p
-          @output << CGI.unescapeHTML(text)
-        end
+      def process_meta(entry, context, &block)
+        listener = AttributesListener.new(context[:attributes])
+        parse(entry.file_data, listener)
       end
 
       class AttributesListener

  Modified: test/decomposers/test-open-document-text.rb (+1 -1)
===================================================================
--- test/decomposers/test-open-document-text.rb    2019-02-26 15:04:24 +0900 (783d641)
+++ test/decomposers/test-open-document-text.rb    2019-02-26 15:31:51 +0900 (41acf84)
@@ -18,7 +18,7 @@ class TestDecomposersOpenDocumentText < Test::Unit::TestCase
   include Helper
 
   def setup
-    @decomposer = ChupaText::Decomposers::OpenDocument.new({})
+    @decomposer = ChupaText::Decomposers::OpenDocumentText.new({})
   end
 
   def decompose(path)
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190226/209bdbd5/attachment-0001.html>


More information about the Groonga-commit mailing list
Back to archive index