[Groonga-commit] groonga/grnxx [master] Add grnxx::Charset and its implementations.

Back to archive index

susumu.yata null+****@clear*****
Tue Mar 12 12:24:32 JST 2013


susumu.yata	2013-03-12 12:24:32 +0900 (Tue, 12 Mar 2013)

  New Revision: 6905bc3103550beb69191caf3432a6a8927d0f0b
  https://github.com/groonga/grnxx/commit/6905bc3103550beb69191caf3432a6a8927d0f0b

  Message:
    Add grnxx::Charset and its implementations.

  Added files:
    lib/charset.cpp
    lib/charset.hpp
    lib/charset/Makefile.am
    lib/charset/euc-jp.cpp
    lib/charset/euc-jp.hpp
    lib/charset/shift_jis.cpp
    lib/charset/shift_jis.hpp
    lib/charset/utf-8.cpp
    lib/charset/utf-8.hpp
  Modified files:
    configure.ac
    lib/Makefile.am

  Modified: configure.ac (+1 -0)
===================================================================
--- configure.ac    2013-03-11 16:45:30 +0900 (8fdf603)
+++ configure.ac    2013-03-12 12:24:32 +0900 (054aa60)
@@ -60,6 +60,7 @@ AC_CHECK_FUNC([nanosleep],
 AC_CONFIG_FILES([Makefile
                  lib/Makefile
                  lib/alpha/Makefile
+                 lib/charset/Makefile
                  lib/db/Makefile
                  lib/io/Makefile
                  lib/map/Makefile

  Modified: lib/Makefile.am (+43 -40)
===================================================================
--- lib/Makefile.am    2013-03-11 16:45:30 +0900 (d08d09b)
+++ lib/Makefile.am    2013-03-12 12:24:32 +0900 (0f3dea6)
@@ -1,53 +1,56 @@
-SUBDIRS = alpha db io map time
+SUBDIRS = alpha charset db io map time
 
 lib_LTLIBRARIES = libgrnxx.la
 
-libgrnxx_la_LIBADD =		\
-	alpha/libgrnxx_alpha.la	\
-	db/libgrnxx_db.la	\
-	io/libgrnxx_io.la	\
-	map/libgrnxx_map.la	\
+libgrnxx_la_LIBADD =			\
+	alpha/libgrnxx_alpha.la		\
+	charset/libgrnxx_charset.la	\
+	db/libgrnxx_db.la		\
+	io/libgrnxx_io.la		\
+	map/libgrnxx_map.la		\
 	time/libgrnxx_time.la
 
 libgrnxx_la_LDFLAGS = @AM_LTLDFLAGS@
 
-libgrnxx_la_SOURCES =		\
-	backtrace.cpp		\
-	error.cpp		\
-	grnxx.cpp		\
-	logger.cpp		\
-	map.cpp			\
-	mutex.cpp		\
-	os.cpp			\
-	recycler.cpp		\
-	slice.cpp		\
-	storage.cpp		\
-	string.cpp		\
-	string_builder.cpp	\
+libgrnxx_la_SOURCES =			\
+	backtrace.cpp			\
+	charset.cpp			\
+	error.cpp			\
+	grnxx.cpp			\
+	logger.cpp			\
+	map.cpp				\
+	mutex.cpp			\
+	os.cpp				\
+	recycler.cpp			\
+	slice.cpp			\
+	storage.cpp			\
+	string.cpp			\
+	string_builder.cpp		\
 	thread.cpp
 
 libgrnxx_includedir = ${includedir}/grnxx
-libgrnxx_include_HEADERS =	\
-	backtrace.hpp		\
-	basic.hpp		\
-	error.hpp		\
-	exception.hpp		\
-	features.hpp		\
-	flags_impl.hpp		\
-	grnxx.hpp		\
-	intrinsic.hpp		\
-	lock.hpp		\
-	logger.hpp		\
-	map.hpp			\
-	mutex.hpp		\
-	os.hpp			\
-	recycler.hpp		\
-	slice.hpp		\
-	storage.hpp		\
-	string.hpp		\
-	string_builder.hpp	\
-	string_format.hpp	\
-	thread.hpp		\
+libgrnxx_include_HEADERS =		\
+	backtrace.hpp			\
+	basic.hpp			\
+	charset.hpp			\
+	error.hpp			\
+	exception.hpp			\
+	features.hpp			\
+	flags_impl.hpp			\
+	grnxx.hpp			\
+	intrinsic.hpp			\
+	lock.hpp			\
+	logger.hpp			\
+	map.hpp				\
+	mutex.hpp			\
+	os.hpp				\
+	recycler.hpp			\
+	slice.hpp			\
+	storage.hpp			\
+	string.hpp			\
+	string_builder.hpp		\
+	string_format.hpp		\
+	thread.hpp			\
 	version.h
 
 EXTRA_DIST = version.sh

  Added: lib/charset.cpp (+66 -0) 100644
===================================================================
--- /dev/null
+++ lib/charset.cpp    2013-03-12 12:24:32 +0900 (04e23d0)
@@ -0,0 +1,66 @@
+/*
+  Copyright (C) 2013  Brazil, Inc.
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+#include "charset.hpp"
+
+#include "charset/euc-jp.hpp"
+#include "charset/shift_jis.hpp"
+#include "charset/utf-8.hpp"
+
+namespace grnxx {
+
+StringBuilder &operator<<(StringBuilder &builder, CharsetCode code) {
+  switch (code) {  // Append the name of "code" to "builder".
+    case CHARSET_SHIFT_JIS: {
+      return builder << "Shift_JIS";
+    }
+    case CHARSET_EUC_JP: {
+      return builder << "EUC-JP";
+    }
+    case CHARSET_UTF_8: {
+      return builder << "UTF-8";
+    }
+    case CHARSET_UNKNOWN: {
+      break;  // Written as "n/a" below.
+    }
+  }
+  return builder << "n/a";  // Also used for values that match no case.
+}
+
+Charset::Charset() {}  // Nothing to initialize.
+Charset::~Charset() {}
+
+const Charset *Charset::open(CharsetCode code) {
+  switch (code) {  // Dispatch to the implementation that handles "code".
+    case CHARSET_SHIFT_JIS: {
+      return charset::Shift_JIS::open();
+    }
+    case CHARSET_EUC_JP: {
+      return charset::EUC_JP::open();
+    }
+    case CHARSET_UTF_8: {
+      return charset::UTF_8::open();
+    }
+    case CHARSET_UNKNOWN: {
+      break;  // No implementation; handled after the switch.
+    }
+  }
+  // TODO: Error handling. For now, unsupported codes yield a null pointer.
+  return nullptr;
+}
+
+}  // namespace grnxx

  Added: lib/charset.hpp (+56 -0) 100644
===================================================================
--- /dev/null
+++ lib/charset.hpp    2013-03-12 12:24:32 +0900 (0a8f88c)
@@ -0,0 +1,56 @@
+/*
+  Copyright (C) 2013  Brazil, Inc.
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+#ifndef GRNXX_CHARSET_HPP
+#define GRNXX_CHARSET_HPP
+
+#include "basic.hpp"
+#include "slice.hpp"
+#include "string_builder.hpp"
+
+namespace grnxx {
+
+// The values of the named charsets correspond to IANA MIBenum numbers.
+// Reference: http://www.iana.org/assignments/character-sets/character-sets.xml
+enum CharsetCode : uint16_t {
+  CHARSET_SHIFT_JIS = 17,
+  CHARSET_EUC_JP    = 18,
+  CHARSET_UTF_8     = 106,
+  CHARSET_UNKNOWN   = 65535  // Largest uint16_t value; printed as "n/a".
+};
+
+StringBuilder &operator<<(StringBuilder &builder, CharsetCode code);
+
+class Charset {
+ public:
+  Charset();
+  virtual ~Charset();
+
+  // Return a pointer to the charset for "code", or nullptr if unsupported.
+  static const Charset *open(CharsetCode code);
+
+  // Return the charset code.
+  virtual CharsetCode code() const = 0;
+
+  // Return the first character of the string "slice". This function returns
+  // an empty slice if "slice" is empty or starts with an invalid sequence.
+  virtual Slice get_char(const Slice &slice) const = 0;
+};
+
+}  // namespace grnxx
+
+#endif  // GRNXX_CHARSET_HPP

  Added: lib/charset/Makefile.am (+14 -0) 100644
===================================================================
--- /dev/null
+++ lib/charset/Makefile.am    2013-03-12 12:24:32 +0900 (25db000)
@@ -0,0 +1,14 @@
+noinst_LTLIBRARIES = libgrnxx_charset.la
+
+libgrnxx_charset_la_LDFLAGS = @AM_LTLDFLAGS@
+
+libgrnxx_charset_la_SOURCES =		\
+	euc-jp.cpp			\
+	shift_jis.cpp			\
+	utf-8.cpp
+
+libgrnxx_charset_includedir = ${includedir}/grnxx/charset
+libgrnxx_charset_include_HEADERS =	\
+	euc-jp.hpp			\
+	shift_jis.hpp			\
+	utf-8.hpp

  Added: lib/charset/euc-jp.cpp (+75 -0) 100644
===================================================================
--- /dev/null
+++ lib/charset/euc-jp.cpp    2013-03-12 12:24:32 +0900 (346f448)
@@ -0,0 +1,75 @@
+/*
+  Copyright (C) 2013  Brazil, Inc.
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+#include "charset/euc-jp.hpp"
+
+namespace grnxx {
+namespace charset {
+
+const Charset *EUC_JP::open() {
+  static EUC_JP singleton;  // One instance is shared by every caller.
+  return &singleton;
+}
+
+CharsetCode EUC_JP::code() const {
+  return CHARSET_EUC_JP;
+}
+
+// Return the first character of "slice" (1, 2, or 3 bytes), "slice" itself
+// if it is empty, or an empty slice if the sequence is invalid/incomplete.
+Slice EUC_JP::get_char(const Slice &slice) const {
+  if (!slice) {
+    return slice;
+  }
+  // Reference: http://ja.wikipedia.org/wiki/EUC-JP
+  if (slice[0] & 0x80) {
+    // 3-byte characters start with 0x8F.
+    if (slice[0] == 0x8F) {
+      // Return an empty slice if the character is incomplete.
+      if (slice.size() < 3) {
+        return slice.prefix(0);
+      }
+      // Return an empty slice if the 2nd or 3rd byte is invalid.
+      // In fact, only 2nd bytes in [A1, A8], [B0, ED], and [F3, FE] are valid.
+      if ((static_cast<unsigned>(slice[1] - 0xA1) > (0xFE - 0xA1)) ||
+          (static_cast<unsigned>(slice[2] - 0xA1) > (0xFE - 0xA1))) {
+        return slice.prefix(0);
+      }
+      return slice.prefix(3);
+    }
+    // Return an empty slice if the character is incomplete.
+    if (slice.size() < 2) {
+      return slice.prefix(0);
+    }
+    // Half-width katakana: 0x8E followed by a byte in [A1, DF].
+    if (slice[0] == 0x8E) {
+      const unsigned offset = static_cast<unsigned>(slice[1] - 0xA1);
+      return slice.prefix((offset <= (0xDF - 0xA1)) ? 2 : 0);
+    }
+    // Return an empty slice if the 1st or 2nd byte is invalid.
+    // In fact, only 1st bytes in [A1, A8], [AD, AD], and [B0, FE] are valid.
+    if ((static_cast<unsigned>(slice[0] - 0xA1) > (0xFE - 0xA1)) ||
+        (static_cast<unsigned>(slice[1] - 0xA1) > (0xFE - 0xA1))) {
+      return slice.prefix(0);
+    }
+    return slice.prefix(2);
+  }
+  return slice.prefix(1);
+}
+
+}  // namespace charset
+}  // namespace grnxx

  Added: lib/charset/euc-jp.hpp (+39 -0) 100644
===================================================================
--- /dev/null
+++ lib/charset/euc-jp.hpp    2013-03-12 12:24:32 +0900 (306d127)
@@ -0,0 +1,39 @@
+/*
+  Copyright (C) 2013  Brazil, Inc.
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+#ifndef GRNXX_CHARSET_EUC_JP_HPP
+#define GRNXX_CHARSET_EUC_JP_HPP
+
+#include "charset.hpp"
+
+namespace grnxx {
+namespace charset {
+
+// EUC-JP: Extended_UNIX_Code_Packed_Format_for_Japanese.
+class EUC_JP : public Charset {
+ public:
+  static const Charset *open();  // Return the shared instance.
+  // Return CHARSET_EUC_JP.
+  CharsetCode code() const override;
+  // Return the first character of "slice", or an empty slice if invalid.
+  Slice get_char(const Slice &slice) const override;
+};
+
+}  // namespace charset
+}  // namespace grnxx
+
+#endif  // GRNXX_CHARSET_EUC_JP_HPP

  Added: lib/charset/shift_jis.cpp (+53 -0) 100644
===================================================================
--- /dev/null
+++ lib/charset/shift_jis.cpp    2013-03-12 12:24:32 +0900 (9ea4f49)
@@ -0,0 +1,53 @@
+/*
+  Copyright (C) 2013  Brazil, Inc.
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+#include "charset/shift_jis.hpp"
+
+namespace grnxx {
+namespace charset {
+
+const Charset *Shift_JIS::open() {
+  static Shift_JIS singleton;  // One instance is shared by every caller.
+  return &singleton;
+}
+
+CharsetCode Shift_JIS::code() const {
+  return CHARSET_SHIFT_JIS;
+}
+
+// Return the first character of "slice": "slice" itself if it is empty, an
+// empty slice if a 2-byte character is cut short or has an invalid 2nd byte.
+Slice Shift_JIS::get_char(const Slice &slice) const {
+  if (!slice) {
+    return slice;
+  }
+  // The 1st byte of a multibyte character is in [81, 9F] or [E0, FC].
+  // Reference: http://www.st.rim.or.jp/~phinloda/cqa/cqa15.html#Q4
+  if (static_cast<unsigned>((slice[0] ^ 0x20) - 0xA1) < 0x3C) {
+    // Return an empty slice if the character is incomplete or if the 2nd
+    // byte is not in [40, 7E] or [80, FC] (7F is never a valid 2nd byte).
+    if ((slice.size() < 2) || (slice[1] < 0x40) || (slice[1] == 0x7F) ||
+        (slice[1] > 0xFC)) {
+      return slice.prefix(0);
+    }
+    return slice.prefix(2);
+  }
+  return slice.prefix(1);
+}
+
+}  // namespace charset
+}  // namespace grnxx

  Added: lib/charset/shift_jis.hpp (+39 -0) 100644
===================================================================
--- /dev/null
+++ lib/charset/shift_jis.hpp    2013-03-12 12:24:32 +0900 (a4095f6)
@@ -0,0 +1,39 @@
+/*
+  Copyright (C) 2013  Brazil, Inc.
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+#ifndef GRNXX_CHARSET_SHIFT_JIS_HPP
+#define GRNXX_CHARSET_SHIFT_JIS_HPP
+
+#include "charset.hpp"
+
+namespace grnxx {
+namespace charset {
+
+// Shift_JIS.
+class Shift_JIS : public Charset {
+ public:
+  static const Charset *open();  // Return the shared instance.
+  // Return CHARSET_SHIFT_JIS.
+  CharsetCode code() const override;
+  // Return the first character of "slice", or an empty slice if invalid.
+  Slice get_char(const Slice &slice) const override;
+};
+
+}  // namespace charset
+}  // namespace grnxx
+
+#endif  // GRNXX_CHARSET_SHIFT_JIS_HPP

  Added: lib/charset/utf-8.cpp (+77 -0) 100644
===================================================================
--- /dev/null
+++ lib/charset/utf-8.cpp    2013-03-12 12:24:32 +0900 (d9e618d)
@@ -0,0 +1,77 @@
+/*
+  Copyright (C) 2013  Brazil, Inc.
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+#include "charset/utf-8.hpp"
+
+#include "intrinsic.hpp"
+
+namespace grnxx {
+namespace charset {
+
+const Charset *UTF_8::open() {
+  static UTF_8 singleton;  // One instance is shared by every caller.
+  return &singleton;
+}
+
+CharsetCode UTF_8::code() const {
+  return CHARSET_UTF_8;
+}
+
+Slice UTF_8::get_char(const Slice &slice) const {
+  if (!slice) {
+    return slice;  // An empty input is returned as is.
+  }
+  if (slice[0] & 0x80) {
+    // A multibyte character is 2, 3, or 4 bytes long, and every byte after
+    // the 1st one must be 10xxxxxx. The size is computed from the inverted
+    // 1st byte (presumably its number of leading 1 bits; see intrinsic.hpp).
+    const size_t char_size =
+        31 - bit_scan_reverse(~(static_cast<uint32_t>(slice[0]) << 24));
+    // Return an empty slice if the character is incomplete.
+    if (char_size > slice.size()) {
+      return slice.prefix(0);
+    }
+    switch (char_size) {
+      case 4: {
+        // Return an empty slice if the 4th byte is invalid.
+        if ((slice[3] & 0xC0) != 0x80) {
+          return slice.prefix(0);
+        }
+      }  // Fall through to check the 3rd byte.
+      case 3: {
+        // Return an empty slice if the 3rd byte is invalid.
+        if ((slice[2] & 0xC0) != 0x80) {
+          return slice.prefix(0);
+        }
+      }  // Fall through to check the 2nd byte.
+      case 2: {
+        // Return an empty slice if the 2nd byte is invalid.
+        if ((slice[1] & 0xC0) != 0x80) {
+          return slice.prefix(0);
+        }
+        return slice.prefix(char_size);
+      }
+      default: {
+        return slice.prefix(0);  // Any other size is invalid.
+      }
+    }
+  }
+  return slice.prefix(1);  // The most significant bit is 0: 1-byte character.
+}
+
+}  // namespace charset
+}  // namespace grnxx

  Added: lib/charset/utf-8.hpp (+39 -0) 100644
===================================================================
--- /dev/null
+++ lib/charset/utf-8.hpp    2013-03-12 12:24:32 +0900 (5ffc75a)
@@ -0,0 +1,39 @@
+/*
+  Copyright (C) 2013  Brazil, Inc.
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+#ifndef GRNXX_CHARSET_UTF_8_HPP
+#define GRNXX_CHARSET_UTF_8_HPP
+
+#include "charset.hpp"
+
+namespace grnxx {
+namespace charset {
+
+// UTF-8.
+class UTF_8 : public Charset {
+ public:
+  static const Charset *open();  // Return the shared instance.
+  // Return CHARSET_UTF_8.
+  CharsetCode code() const override;
+  // Return the first character of "slice", or an empty slice if invalid.
+  Slice get_char(const Slice &slice) const override;
+};
+
+}  // namespace charset
+}  // namespace grnxx
+
+#endif  // GRNXX_CHARSET_UTF_8_HPP
-------------- next part --------------
HTMLの添付ファイルを保管しました...
下载 



More information about the Groonga-commit mailing list
Back to archive index