Kouhei Sutou 2019-02-26 15:31:51 +0900 (Tue, 26 Feb 2019) Revision: a3b273e9922f756d58da7d157234e414fcf6d7a7 https://github.com/ranguba/chupa-text/commit/a3b273e9922f756d58da7d157234e414fcf6d7a7 Message: Add open-document-text decomposer Added files: lib/chupa-text/decomposers/open-document-text.rb Modified files: lib/chupa-text/decomposers/open-document-presentation.rb lib/chupa-text/decomposers/open-document-spreadsheet.rb lib/chupa-text/decomposers/open-document.rb test/decomposers/test-open-document-text.rb Modified: lib/chupa-text/decomposers/open-document-presentation.rb (+12 -42) =================================================================== --- lib/chupa-text/decomposers/open-document-presentation.rb 2019-02-26 15:04:24 +0900 (67938cd) +++ lib/chupa-text/decomposers/open-document-presentation.rb 2019-02-26 15:31:51 +0900 (e6176a9) @@ -27,58 +27,28 @@ module ChupaText @mime_type = "application/vnd.oasis.opendocument.presentation" end - def target?(data) - data.extension == @extension or - data.mime_type == @mime_type + private + def process_content(entry, context, &block) + context[:slides] = [] + listener = SlidesListener.new(context[:slides]) + parse(entry.file_data, listener) end - def target_score(data) - if target?(data) - -1 - else - nil + def finish_decompose(context, &block) + metadata = TextData.new("", source_data: context[:data]) + context[:attributes].each do |name, value| + metadata[name] = value end - end + yield(metadata) - def decompose(data) - slides = [] - data.open do |input| - Archive::Zip.open(input) do |zip| - zip.each do |entry| - next unless entry.file? - case entry.zip_path - when "content.xml" - listener = SlidesListener.new(slides) - parse(entry.file_data, listener) - when "meta.xml" - attributes = {} - listener = AttributesListener.new(attributes) - parse(entry.file_data, listener) - metadata = TextData.new("", source_data: data) - attributes.each do |name, value| - metadata[name] = value - end - yield(metadata) - end - end - end - end - slides.each_with_index do |slide, i| + (context[:slides] || []).each_with_index do |slide, i| text = slide[:text] - text_data = TextData.new(text, source_data: data) + text_data = TextData.new(text, source_data: context[:data]) text_data["index"] = i yield(text_data) end end - private - def parse(io, listener) - source = REXML::Source.new(io.read) - parser = REXML::Parsers::SAX2Parser.new(source) - parser.listen(listener) - parser.parse - end - class SlidesListener include REXML::SAX2Listener Modified: lib/chupa-text/decomposers/open-document-spreadsheet.rb (+12 -42) =================================================================== --- lib/chupa-text/decomposers/open-document-spreadsheet.rb 2019-02-26 15:04:24 +0900 (a4c7eff) +++ lib/chupa-text/decomposers/open-document-spreadsheet.rb 2019-02-26 15:31:51 +0900 (6f218d9) @@ -27,45 +27,23 @@ module ChupaText @mime_type = "application/vnd.oasis.opendocument.spreadsheet" end - def target?(data) - data.extension == @extension or - data.mime_type == @mime_type + private + def process_content(entry, context, &block) + context[:sheets] = [] + listener = SheetsListener.new(context[:sheets]) + parse(entry.file_data, listener) end - def target_score(data) - if target?(data) - -1 - else - nil + def finish_decompose(context, &block) + metadata = TextData.new("", source_data: context[:data]) + context[:attributes].each do |name, value| + metadata[name] = value end - end + yield(metadata) - def decompose(data) - sheets = [] - data.open do |input| - Archive::Zip.open(input) do |zip| - zip.each do |entry| - next unless entry.file? - case entry.zip_path - when "content.xml" - listener = SheetsListener.new(sheets) - parse(entry.file_data, listener) - when "meta.xml" - attributes = {} - listener = AttributesListener.new(attributes) - parse(entry.file_data, listener) - metadata = TextData.new("", source_data: data) - attributes.each do |name, value| - metadata[name] = value - end - yield(metadata) - end - end - end - end - sheets.each_with_index do |sheet, i| + (context[:sheets] || []).each_with_index do |sheet, i| text = sheet[:text] - text_data = TextData.new(text, source_data: data) + text_data = TextData.new(text, source_data: context[:data]) text_data["index"] = i name = sheet[:name] text_data["name"] = name if name @@ -73,14 +51,6 @@ module ChupaText end end - private - def parse(io, listener) - source = REXML::Source.new(io.read) - parser = REXML::Parsers::SAX2Parser.new(source) - parser.listen(listener) - parser.parse - end - class SheetsListener include REXML::SAX2Listener Added: lib/chupa-text/decomposers/open-document-text.rb (+89 -0) 100644 =================================================================== --- /dev/null +++ lib/chupa-text/decomposers/open-document-text.rb 2019-02-26 15:31:51 +0900 (4984313) @@ -0,0 +1,89 @@ +# Copyright (C) 2019 Kouhei Sutou <kou****@clear*****> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +require "chupa-text/decomposers/open-document" + +module ChupaText + module Decomposers + class OpenDocumentText < OpenDocument + registry.register("open-document-text", self) + + def initialize(options={}) + super + @extension = "odt" + @mime_type = "application/vnd.oasis.opendocument.text" + end + + private + def process_content(entry, context, &block) + context[:text] = "" + listener = TextListener.new(context[:text]) + parse(entry.file_data, listener) + end + + def finish_decompose(context, &block) + text_data = TextData.new(context[:text] || "", + source_data: context[:data]) + context[:attributes].each do |name, value| + text_data[name] = value + end + yield(text_data) + end + + class TextListener + include REXML::SAX2Listener + + TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0" + def initialize(output) + @output = output + @in_p = false + end + + def start_element(uri, local_name, qname, attributes) + return unless uri == TEXT_URI + case local_name + when "p" + @in_p = true + end + end + + def end_element(uri, local_name, qname) + @in_p = false + + return unless uri == TEXT_URI + case local_name + when "p" + @output << "\n" + end + end + + def characters(text) + add_text(text) + end + + def cdata(content) + add_text(content) + end + + private + def add_text(text) + return unless @in_p + @output << CGI.unescapeHTML(text) + end + end + end + end +end Modified: lib/chupa-text/decomposers/open-document.rb (+10 -62) =================================================================== --- lib/chupa-text/decomposers/open-document.rb 2019-02-26 15:04:24 +0900 (635c800) +++ lib/chupa-text/decomposers/open-document.rb 2019-02-26 15:31:51 +0900 (8aa5ddb) @@ -23,17 +23,9 @@ require "archive/zip" module ChupaText module Decomposers class OpenDocument < Decomposer - registry.register("open-document", self) - - EXTENSIONS = [ - "odt", - ] - MIME_TYPES = [ - "application/vnd.oasis.opendocument.text", - ] def target?(data) - EXTENSIONS.include?(data.extension) or - MIME_TYPES.include?(data.mime_type) + data.extension == @extension or + data.mime_type == @mime_type end def target_score(data) @@ -44,9 +36,9 @@ module ChupaText end end - def decompose(data) + def decompose(data, &block) context = { - text: "", + data: data, attributes: {}, } data.open do |input| @@ -55,21 +47,14 @@ module ChupaText next unless entry.file? case entry.zip_path when "content.xml" - listener = TextListener.new(context[:text]) - parse(entry.file_data, listener) + process_content(entry, context, &block) when "meta.xml" - listener = AttributesListener.new(context[:attributes]) - parse(entry.file_data, listener) + process_meta(entry, context, &block) end end end end - text = context[:text] - text_data = TextData.new(text, source_data: data) - context[:attributes].each do |name, value| - text_data[name] = value - end - yield(text_data) + finish_decompose(context, &block) end private @@ -80,46 +65,9 @@ module ChupaText parser.parse end - class TextListener - include REXML::SAX2Listener - - TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0" - def initialize(output) - @output = output - @in_p = false - end - - def start_element(uri, local_name, qname, attributes) - return unless uri == TEXT_URI - case local_name - when "p" - @in_p = true - end - end - - def end_element(uri, local_name, qname) - @in_p = false - - return unless uri == TEXT_URI - case local_name - when "p" - @output << "\n" - end - end - - def characters(text) - add_text(text) - end - - def cdata(content) - add_text(content) - end - - private - def add_text(text) - return unless @in_p - @output << CGI.unescapeHTML(text) - end + def process_meta(entry, context, &block) + listener = AttributesListener.new(context[:attributes]) + parse(entry.file_data, listener) end class AttributesListener Modified: test/decomposers/test-open-document-text.rb (+1 -1) =================================================================== --- test/decomposers/test-open-document-text.rb 2019-02-26 15:04:24 +0900 (783d641) +++ test/decomposers/test-open-document-text.rb 2019-02-26 15:31:51 +0900 (41acf84) @@ -18,7 +18,7 @@ class TestDecomposersOpenDocumentText < Test::Unit::TestCase include Helper def setup - @decomposer = ChupaText::Decomposers::OpenDocument.new({}) + @decomposer = ChupaText::Decomposers::OpenDocumentText.new({}) end def decompose(path) -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190226/209bdbd5/attachment-0001.html>