susumu.yata
null+****@clear*****
Tue Mar 12 12:24:32 JST 2013
susumu.yata 2013-03-12 12:24:32 +0900 (Tue, 12 Mar 2013) New Revision: 6905bc3103550beb69191caf3432a6a8927d0f0b https://github.com/groonga/grnxx/commit/6905bc3103550beb69191caf3432a6a8927d0f0b Message: Add grnxx::Charset and its implementations. Added files: lib/charset.cpp lib/charset.hpp lib/charset/Makefile.am lib/charset/euc-jp.cpp lib/charset/euc-jp.hpp lib/charset/shift_jis.cpp lib/charset/shift_jis.hpp lib/charset/utf-8.cpp lib/charset/utf-8.hpp Modified files: configure.ac lib/Makefile.am Modified: configure.ac (+1 -0) =================================================================== --- configure.ac 2013-03-11 16:45:30 +0900 (8fdf603) +++ configure.ac 2013-03-12 12:24:32 +0900 (054aa60) @@ -60,6 +60,7 @@ AC_CHECK_FUNC([nanosleep], AC_CONFIG_FILES([Makefile lib/Makefile lib/alpha/Makefile + lib/charset/Makefile lib/db/Makefile lib/io/Makefile lib/map/Makefile Modified: lib/Makefile.am (+43 -40) =================================================================== --- lib/Makefile.am 2013-03-11 16:45:30 +0900 (d08d09b) +++ lib/Makefile.am 2013-03-12 12:24:32 +0900 (0f3dea6) @@ -1,53 +1,56 @@ -SUBDIRS = alpha db io map time +SUBDIRS = alpha charset db io map time lib_LTLIBRARIES = libgrnxx.la -libgrnxx_la_LIBADD = \ - alpha/libgrnxx_alpha.la \ - db/libgrnxx_db.la \ - io/libgrnxx_io.la \ - map/libgrnxx_map.la \ +libgrnxx_la_LIBADD = \ + alpha/libgrnxx_alpha.la \ + charset/libgrnxx_charset.la \ + db/libgrnxx_db.la \ + io/libgrnxx_io.la \ + map/libgrnxx_map.la \ time/libgrnxx_time.la libgrnxx_la_LDFLAGS = @AM_LTLDFLAGS@ -libgrnxx_la_SOURCES = \ - backtrace.cpp \ - error.cpp \ - grnxx.cpp \ - logger.cpp \ - map.cpp \ - mutex.cpp \ - os.cpp \ - recycler.cpp \ - slice.cpp \ - storage.cpp \ - string.cpp \ - string_builder.cpp \ +libgrnxx_la_SOURCES = \ + backtrace.cpp \ + charset.cpp \ + error.cpp \ + grnxx.cpp \ + logger.cpp \ + map.cpp \ + mutex.cpp \ + os.cpp \ + recycler.cpp \ + slice.cpp \ + storage.cpp \ + string.cpp \ + string_builder.cpp \ thread.cpp 
libgrnxx_includedir = ${includedir}/grnxx -libgrnxx_include_HEADERS = \ - backtrace.hpp \ - basic.hpp \ - error.hpp \ - exception.hpp \ - features.hpp \ - flags_impl.hpp \ - grnxx.hpp \ - intrinsic.hpp \ - lock.hpp \ - logger.hpp \ - map.hpp \ - mutex.hpp \ - os.hpp \ - recycler.hpp \ - slice.hpp \ - storage.hpp \ - string.hpp \ - string_builder.hpp \ - string_format.hpp \ - thread.hpp \ +libgrnxx_include_HEADERS = \ + backtrace.hpp \ + basic.hpp \ + charset.hpp \ + error.hpp \ + exception.hpp \ + features.hpp \ + flags_impl.hpp \ + grnxx.hpp \ + intrinsic.hpp \ + lock.hpp \ + logger.hpp \ + map.hpp \ + mutex.hpp \ + os.hpp \ + recycler.hpp \ + slice.hpp \ + storage.hpp \ + string.hpp \ + string_builder.hpp \ + string_format.hpp \ + thread.hpp \ version.h EXTRA_DIST = version.sh Added: lib/charset.cpp (+66 -0) 100644 =================================================================== --- /dev/null +++ lib/charset.cpp 2013-03-12 12:24:32 +0900 (04e23d0) @@ -0,0 +1,66 @@ +/* + Copyright (C) 2013 Brazil, Inc. + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#include "charset.hpp" + +#include "charset/euc-jp.hpp" +#include "charset/shift_jis.hpp" +#include "charset/utf-8.hpp" + +namespace grnxx { + +StringBuilder &operator<<(StringBuilder &builder, CharsetCode code) { + switch (code) { + case CHARSET_SHIFT_JIS: { + return builder << "Shift_JIS"; + } + case CHARSET_EUC_JP: { + return builder << "EUC-JP"; + } + case CHARSET_UTF_8: { + return builder << "UTF-8"; + } + case CHARSET_UNKNOWN: { + break; + } + } + return builder << "n/a"; +} + +Charset::Charset() {} +Charset::~Charset() {} + +const Charset *Charset::open(CharsetCode code) { + switch (code) { + case CHARSET_SHIFT_JIS: { + return charset::Shift_JIS::open(); + } + case CHARSET_EUC_JP: { + return charset::EUC_JP::open(); + } + case CHARSET_UTF_8: { + return charset::UTF_8::open(); + } + case CHARSET_UNKNOWN: { + break; + } + } + // TODO: Error handling. + return nullptr; +} + +} // namespace grnxx Added: lib/charset.hpp (+56 -0) 100644 =================================================================== --- /dev/null +++ lib/charset.hpp 2013-03-12 12:24:32 +0900 (0a8f88c) @@ -0,0 +1,56 @@ +/* + Copyright (C) 2013 Brazil, Inc. + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#ifndef GRNXX_CHARSET_HPP +#define GRNXX_CHARSET_HPP + +#include "basic.hpp" +#include "slice.hpp" +#include "string_builder.hpp" + +namespace grnxx { + +// The values correspond to MIB enum numbers. +// Reference: http://www.iana.org/assignments/character-sets/character-sets.xml +enum CharsetCode : uint16_t { + CHARSET_SHIFT_JIS = 17, + CHARSET_EUC_JP = 18, + CHARSET_UTF_8 = 106, + CHARSET_UNKNOWN = 65535 +}; + +StringBuilder &operator<<(StringBuilder &builder, CharsetCode code); + +class Charset { + public: + Charset(); + virtual ~Charset(); + + // Return a reference to a specific charset. + static const Charset *open(CharsetCode code); + + // Return the charset code. + virtual CharsetCode code() const = 0; + + // Return the first character of the string "slice". This function may return + // an empty slice if "slice" is empty or an invalid sequence. + virtual Slice get_char(const Slice &slice) const = 0; +}; + +} // namespace grnxx + +#endif // GRNXX_CHARSET_HPP Added: lib/charset/Makefile.am (+14 -0) 100644 =================================================================== --- /dev/null +++ lib/charset/Makefile.am 2013-03-12 12:24:32 +0900 (25db000) @@ -0,0 +1,14 @@ +noinst_LTLIBRARIES = libgrnxx_charset.la + +libgrnxx_charset_la_LDFLAGS = @AM_LTLDFLAGS@ + +libgrnxx_charset_la_SOURCES = \ + euc-jp.cpp \ + shift_jis.cpp \ + utf-8.cpp + +libgrnxx_charset_includedir = ${includedir}/grnxx/charset +libgrnxx_charset_include_HEADERS = \ + euc-jp.hpp \ + shift_jis.hpp \ + utf-8.hpp Added: lib/charset/euc-jp.cpp (+75 -0) 100644 =================================================================== --- /dev/null +++ lib/charset/euc-jp.cpp 2013-03-12 12:24:32 +0900 (346f448) @@ -0,0 +1,75 @@ +/* + Copyright (C) 2013 Brazil, Inc. 
+ + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#include "charset/euc-jp.hpp" + +namespace grnxx { +namespace charset { + +const Charset *EUC_JP::open() { + static EUC_JP singleton; + return &singleton; +} + +CharsetCode EUC_JP::code() const { + return CHARSET_EUC_JP; +} + +Slice EUC_JP::get_char(const Slice &slice) const { + if (!slice) { + return slice; + } + // Reference: http://ja.wikipedia.org/wiki/EUC-JP + if (slice[0] & 0x80) { + // 3-byte characters start with 0x8F. + if (slice[0] == 0x8F) { + // Return an empty slice if the character is incomplete. + if (slice.size() < 3) { + return slice.prefix(0); + } + // Return an empty slice if the 2nd byte is invalid. + // In fact, only bytes in [A1, A8], [B0, ED], and [F3, FE] are valid. + if (static_cast<unsigned>(slice[1] - 0xA1) > (0xFE - 0xA1)) { + return slice.prefix(0); + } + // Return an empty slice if the 3rd byte is invalid. + if (static_cast<unsigned>(slice[2] - 0xA1) > (0xFE - 0xA1)) { + return slice.prefix(0); + } + return slice.prefix(3); + } else { + // Return an empty slice if the 1st byte is invalid. + // In fact, only bytes in [A1, A8], [AD, AD], and [B0, FE] are valid. + if (static_cast<unsigned>(slice[0] - 0xA1) > (0xFE - 0xA1)) { + return slice.prefix(0); + } + // Return an empty slice if the character is incomplete. 
+ if (slice.size() < 2) { + return slice.prefix(0); + } + // Return an empty slice if the 2nd byte is invalid. + if (static_cast<unsigned>(slice[1] - 0xA1) > (0xFE - 0xA1)) { + return slice.prefix(0); + } + return slice.prefix(2); + } + } + return slice.prefix(1); +} + +} // namespace charset +} // namespace grnxx Added: lib/charset/euc-jp.hpp (+39 -0) 100644 =================================================================== --- /dev/null +++ lib/charset/euc-jp.hpp 2013-03-12 12:24:32 +0900 (306d127) @@ -0,0 +1,39 @@ +/* + Copyright (C) 2013 Brazil, Inc. + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#ifndef GRNXX_CHARSET_EUC_JP_HPP +#define GRNXX_CHARSET_EUC_JP_HPP + +#include "charset.hpp" + +namespace grnxx { +namespace charset { + +// EUC-JP: Extended_UNIX_Code_Packed_Format_for_Japanese. +class EUC_JP : public Charset { + public: + static const Charset *open(); + + CharsetCode code() const; + + Slice get_char(const Slice &slice) const; +}; + +} // namespace charset +} // namespace grnxx + +#endif // GRNXX_CHARSET_EUC_JP_HPP Added: lib/charset/shift_jis.cpp (+53 -0) 100644 =================================================================== --- /dev/null +++ lib/charset/shift_jis.cpp 2013-03-12 12:24:32 +0900 (9ea4f49) @@ -0,0 +1,53 @@ +/* + Copyright (C) 2013 Brazil, Inc. 
+ + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#include "charset/shift_jis.hpp" + +namespace grnxx { +namespace charset { + +const Charset *Shift_JIS::open() { + static Shift_JIS singleton; + return &singleton; +} + +CharsetCode Shift_JIS::code() const { + return CHARSET_SHIFT_JIS; +} + +Slice Shift_JIS::get_char(const Slice &slice) const { + if (!slice) { + return slice; + } + // The 1st byte of a multibyte character is in [81, 9F] or [E0, FC]. + // Reference: http://www.st.rim.or.jp/~phinloda/cqa/cqa15.html#Q4 + if (static_cast<unsigned>((slice[0] ^ 0x20) - 0xA1) < 0x3C) { + // Return an empty slice if the character is incomplete. + if (slice.size() < 2) { + return slice.prefix(0); + } + // Return an empty slice if the 2nd byte is invalid. + if (static_cast<unsigned>(slice[1] - 0x40) > (0xFC - 0x40)) { + return slice.prefix(0); + } + return slice.prefix(2); + } + return slice.prefix(1); +} + +} // namespace charset +} // namespace grnxx Added: lib/charset/shift_jis.hpp (+39 -0) 100644 =================================================================== --- /dev/null +++ lib/charset/shift_jis.hpp 2013-03-12 12:24:32 +0900 (a4095f6) @@ -0,0 +1,39 @@ +/* + Copyright (C) 2013 Brazil, Inc. 
+ + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#ifndef GRNXX_CHARSET_SHIFT_JIS_HPP +#define GRNXX_CHARSET_SHIFT_JIS_HPP + +#include "charset.hpp" + +namespace grnxx { +namespace charset { + +// Shift_JIS. +class Shift_JIS : public Charset { + public: + static const Charset *open(); + + CharsetCode code() const; + + Slice get_char(const Slice &slice) const; +}; + +} // namespace charset +} // namespace grnxx + +#endif // GRNXX_CHARSET_SHIFT_JIS_HPP Added: lib/charset/utf-8.cpp (+77 -0) 100644 =================================================================== --- /dev/null +++ lib/charset/utf-8.cpp 2013-03-12 12:24:32 +0900 (d9e618d) @@ -0,0 +1,77 @@ +/* + Copyright (C) 2013 Brazil, Inc. + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#include "charset/utf-8.hpp" + +#include "intrinsic.hpp" + +namespace grnxx { +namespace charset { + +const Charset *UTF_8::open() { + static UTF_8 singleton; + return &singleton; +} + +CharsetCode UTF_8::code() const { + return CHARSET_UTF_8; +} + +Slice UTF_8::get_char(const Slice &slice) const { + if (!slice) { + return slice; + } + if (slice[0] & 0x80) { + // A multibyte character can be 2, 3, or 4 bytes long. Also, the 2nd, + // 3rd, and 4th byte must be 10xxxxxx, the most significant 2 bits must + // be 10. + const size_t char_size = + 31 - bit_scan_reverse(~(static_cast<uint32_t>(slice[0]) << 24)); + // Return an empty slice if the character is incomplete. + if (char_size > slice.size()) { + return slice.prefix(0); + } + switch (char_size) { + case 4: { + // Return an empty slice if the 4th byte is invalid. + if ((slice[3] & 0xC0) != 0x80) { + return slice.prefix(0); + } + } + case 3: { + // Return an empty slice if the 3rd byte is invalid. + if ((slice[2] & 0xC0) != 0x80) { + return slice.prefix(0); + } + } + case 2: { + // Return an empty slice if the 2nd byte is invalid. + if ((slice[1] & 0xC0) != 0x80) { + return slice.prefix(0); + } + return slice.prefix(char_size); + } + default: { + return slice.prefix(0); + } + } + } + return slice.prefix(1); +} + +} // namespace charset +} // namespace grnxx Added: lib/charset/utf-8.hpp (+39 -0) 100644 =================================================================== --- /dev/null +++ lib/charset/utf-8.hpp 2013-03-12 12:24:32 +0900 (5ffc75a) @@ -0,0 +1,39 @@ +/* + Copyright (C) 2013 Brazil, Inc. 
+ + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#ifndef GRNXX_CHARSET_UTF_8_HPP +#define GRNXX_CHARSET_UTF_8_HPP + +#include "charset.hpp" + +namespace grnxx { +namespace charset { + +// UTF-8. +class UTF_8 : public Charset { + public: + static const Charset *open(); + + CharsetCode code() const; + + Slice get_char(const Slice &slice) const; +}; + +} // namespace charset +} // namespace grnxx + +#endif // GRNXX_CHARSET_UTF_8_HPP -------------- next part -------------- HTMLの添付ファイルを保管しました... 下载