Revision: 7422
          http://sourceforge.jp/projects/ttssh2/scm/svn/commits/7422
Author:   zmatsuo
Date:     2019-02-03 23:43:53 +0900 (Sun, 03 Feb 2019)
Log Message:
-----------
UTF8,UTF16間の変換を自分で行うようにした (Windows95対策)

Modified Paths:
--------------
    trunk/teraterm/common/codeconv.cpp
    trunk/teraterm/common/codeconv.h

-------------- next part --------------
Modified: trunk/teraterm/common/codeconv.cpp
===================================================================
--- trunk/teraterm/common/codeconv.cpp	2019-02-03 02:55:12 UTC (rev 7421)
+++ trunk/teraterm/common/codeconv.cpp	2019-02-03 14:43:53 UTC (rev 7422)
@@ -29,20 +29,276 @@
 #include <windows.h>
 #include <string.h>
 #include <crtdbg.h>
-
+#if (defined(_MSC_VER) && (_MSC_VER >= 1600)) || !defined(_MSC_VER)
+#include <stdint.h>
+#endif
 #include "codeconv.h"
 
+#if defined(_MSC_VER) && (_MSC_VER < 1600)
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+#endif
+
 #ifdef _DEBUG
 #define malloc(l) _malloc_dbg((l), _NORMAL_BLOCK, __FILE__, __LINE__)
 #define free(p) _free_dbg((p), _NORMAL_BLOCK)
+#define _strdup(s) _strdup_dbg((s), _NORMAL_BLOCK, __FILE__, __LINE__)
+#define _wcsdup(s) _wcsdup_dbg((s), _NORMAL_BLOCK, __FILE__, __LINE__)
 #endif
 
-int CP932ToWideChar(const char *cp932_ptr, int cp932_len, wchar_t *wstr_ptr, int wstr_len)
+/**
+ * UTF-32 から UTF-8 へ変換する
+ * @param[in] u32 変換するUTF-32
+ * @param[in,out] u8_ptr 変換後UTF-8文字列出力先(NULLのとき出力しない)
+ * @param[in] u8_len UTF-8出力先文字数(バッファ長,byte数)
+ * @retval 使用したutf8文字数(byte数）
+ * 0=エラー
+ */
+size_t UTF32ToUTF8(uint32_t u32, char *u8_ptr_, size_t u8_len)
 {
-	return MultiByteToWideChar(932, 0, cp932_ptr, cp932_len, wstr_ptr, wstr_len);
+	size_t out_len = 0;
+	uint8_t *u8_ptr = (uint8_t *)u8_ptr_;
+	if (u8_ptr != NULL) {
+		u8_len = 4;
+	}
+
+	if (u32 <= 0x0000007f) {
+		// 0x00000000 <= u32 <= 0x0000007f
+		if (u8_len >= 1) {
+			if (u8_ptr != NULL) {
+				u8_ptr[0] = (uint8_t)u32;
+			}
+			out_len = 1;
+		}
+	} else if (u32 <= 0x000007ff) {
+		// 0x00000080 <= u32 <= 0x000007ff
+		if (u8_len >= 2) {
+			if (u8_ptr != NULL) {
+				u8_ptr[0] = ((u32 >> 6) & 0x1f) | 0xc0;
+				u8_ptr[1] = (u32 & 0x3f) | 0x80;
+			}
+			out_len = 2;
+		}
+	} else if (u32 <= 0x0000ffff) {
+		// 0x00000800 <= u32 <= 0x0000ffff
+		if (u8_len >= 3) {
+			if (u8_ptr != NULL) {
+				u8_ptr[0] = ((u32 >> 12) & 0xf) | 0xe0;
+				u8_ptr[1] = ((u32 >> 6) & 0x3f) | 0x80;
+				u8_ptr[2] = (u32 & 0x3f) | 0x80;
+			}
+			out_len = 3;
+		}
+	} else if (u32 <= 0x0010ffff) {
+		// 0x00010000 <= u32 <= 0x0010ffff
+		if (u8_len >= 4) {
+			if (u8_ptr != NULL) {
+				u8_ptr[0] = ((uint8_t)(u32 >> 18)) | 0xf0;
+				u8_ptr[1] = ((u32 >> 12) & 0x3f) | 0x80;
+				u8_ptr[2] = ((u32 >> 6) & 0x3f) | 0x80;
+				u8_ptr[3] = (u32 & 0x3f) | 0x80;
+			}
+			out_len = 4;
+		}
+	} else {
+		out_len = 0;
+	}
+	return out_len;
 }
 
 /**
+ * UTF-8文字列からUTF-32を1文字取り出す
+ * @param[in] u8_ptr UTF-8文字列へのポインタ
+ * @param[in] u8_len UTF-8文字列長さ
+ * @param[out] u32 変換したUTF-32文字
+ * @retval 使用したUTF-8文字数(byte数）
+ * 0=エラー
+ */
+size_t UTF8ToUTF32(const char *u8_ptr_, size_t u8_len, uint32_t *u32_)
+{
+	uint8_t *u8_ptr = (uint8_t *)u8_ptr_;
+	uint32_t u32;
+	size_t u8_in;
+	const uint8_t c1 = *u8_ptr++;
+	if (c1 <= 0x7f) {
+		// 1byte
+		if (u8_len >= 1) {
+			u32 = (uint32_t)c1;
+			u8_in = 1;
+		} else {
+			goto error;
+		}
+	} else if (0xc2 <= c1 && c1 <= 0xdf) {
+		// 2byte
+		if (u8_len >= 2) {
+			const uint8_t c2 = *u8_ptr++;
+			if (((c1 & 0x1e) != 0) &&
+				((c2 & 0xc0) == 0x80))
+			{
+				u32 = (uint32_t)((c1 & 0x1f) << 6) + (c2 & 0x3f);
+				u8_in = 2;
+			} else {
+				goto error;
+			}
+		} else {
+			goto error;
+		}
+	} else if (0xe0 <= c1 && c1 <= 0xef) {
+		// 3byte
+		if (u8_len >= 3) {
+			const uint8_t c2 = *u8_ptr++;
+			const uint8_t c3 = *u8_ptr++;
+			if ((((c1 & 0x0f) != 0) || ((c2 & 0x20) != 0)) &&
+				((c2 & 0xc0) == 0x80) &&
+				((c3 & 0xc0) == 0x80) )
+			{
+				u32 = (uint32_t)((c1 & 0x0f) << 12) + ((c2 & 0x3f) << 6);
+				u32 += (c3 & 0x3f);
+				u8_in = 3;
+			} else {
+				goto error;
+			}
+		} else {
+			goto error;
+		}
+	} else if (0xf0 <= c1 && c1 <= 0xf7 && u8_len >= 4) {
+		// 4byte
+		if (u8_len >= 4) {
+			const uint8_t c2 = *u8_ptr++;
+			const uint8_t c3 = *u8_ptr++;
+			const uint8_t c4 = *u8_ptr++;
+			if ((((c1 & 0x07) != 0) || ((c2 & 0x30) != 0)) &&
+				((c2 & 0xc0) == 0x80) &&
+				((c3 & 0xc0) == 0x80) &&
+				((c4 & 0xc0) == 0x80) )
+			{
+				u32 = (uint32_t)((c1 & 0x07) << 18) + ((c2 & 0x3f) << 12);
+				u32 += ((c3 & 0x3f) << 6) + (c4 & 0x3f);
+				u8_in = 4;
+			} else {
+				goto error;
+			}
+		} else {
+			goto error;
+		}
+	} else {
+	error:
+		u32 = 0;
+		u8_in = 0;
+	}
+	*u32_ = u32;
+	return u8_in;
+}
+
+// WideCharToMultiByteのUTF8特化版
+int WideCharToUTF8(const wchar_t *wstr_ptr, int wstr_len, char *u8_ptr, int u8_len)
+{
+	int u8_out_sum = 0;
+	if (u8_ptr == NULL) {
+		u8_len = 4;
+	} else {
+		if (u8_len == 0) {
+			return 0;
+		}
+	}
+	if (wstr_len < 0) {
+		wstr_len = (int)wcslen(wstr_ptr) + 1;
+	}
+
+	while(u8_len > 0 && wstr_len > 0) {
+		const wchar_t u16 = *wstr_ptr++;
+		uint32_t u32 = u16;
+		size_t u8_out;
+		wstr_len--;
+		// サロゲート high?
+		if (0xd800 <= u16 && u16 < 0xdc00) {
+			if (wstr_len >= 1) {
+				const wchar_t u16_lo = *wstr_ptr++;
+				wstr_len--;
+				// サロゲート low?
+				if (0xdc00 <= u16_lo && u16_lo < 0xe000) {
+					// サロゲートペア デコード
+					u32 = 0x10000 + (u16 - 0xd800) * 0x400 + (u16_lo - 0xdc00);
+				} else {
+					goto unknown_code;
+				}
+			} else {
+			unknown_code:
+				if (u8_ptr != NULL) {
+					*u8_ptr++ = '?';
+				}
+				u8_out = 1;
+				goto loop_next;
+			}
+		}
+		u8_out = UTF32ToUTF8(u32, u8_ptr, u8_len);
+		if (u8_out == 0) {
+			goto unknown_code;
+		}
+	loop_next:
+		u8_out_sum += u8_out;
+		if (u8_ptr != NULL) {
+			u8_ptr += u8_out;
+			u8_len -= u8_out;
+		}
+	}
+	return u8_out_sum;
+}
+
+// MultiByteToWideCharのUTF8特化版
+int UTF8ToWideChar(const char *u8_ptr, int u8_len, wchar_t *wstr_ptr, int wstr_len)
+{
+	size_t u16_out_sum = 0;
+	if (u8_len < 0) {
+		u8_len = strlen(u8_ptr) + 1;
+	}
+	if (wstr_ptr == NULL) {
+		wstr_len = 1;
+	}
+
+	while(wstr_len > 0 && u8_len > 0) {
+		uint32_t u32;
+		size_t u16_out;
+		size_t u8_in = UTF8ToUTF32(u8_ptr, u8_len, &u32);
+		if (u8_in == 0) {
+			u32 = '?';
+			u8_in = 1;
+		}
+		u8_ptr += u8_in;
+		u8_len -= u8_in;
+
+		if (u32 < 0x10000) {
+			if (wstr_ptr != NULL) {
+				*wstr_ptr++ = (uint16_t)u32;
+			}
+			u16_out = 1;
+		} else if (u32 <= 0x10ffff) {
+			if (wstr_len > 2) {
+				if (wstr_ptr != NULL) {
+					// サロゲート エンコード
+					*wstr_ptr++ = uint16_t((u32 - 0x10000) / 0x400) + 0xd800;
+					*wstr_ptr++ = uint16_t((u32 - 0x10000) % 0x400) + 0xdc00;
+				}
+				u16_out = 2;
+			} else {
+				*wstr_ptr++ = '?';
+				u16_out = 1;
+			}
+		} else {
+			*wstr_ptr++ = '?';
+			u16_out = 1;
+		}
+
+		if (wstr_ptr != NULL) {
+			wstr_len -= u16_out;
+		}
+		u16_out_sum += u16_out;
+	}
+	return u16_out_sum;
+}
+
+/**
  * wchar_t文字列をマルチバイト文字列へ変換
  * @param[in] *wstr_ptr wchar_t文字列
  * @param[in] wstr_len wchar_t文字列長(0のとき自動)
@@ -60,10 +316,16 @@
 	if (wstr_len == 0) {
 		wstr_len = wcslen(wstr_ptr) + 1;
 	}
-	int len = ::WideCharToMultiByte(code_page, flags,
+	int len;
+	if (code_page == CP_UTF8) {
+		len = WideCharToUTF8(wstr_ptr, (DWORD)wstr_len,
+							 NULL, 0);
+	} else {
+		len = ::WideCharToMultiByte(code_page, flags,
 									wstr_ptr, (DWORD)wstr_len,
 									NULL, 0,
 									NULL, NULL);
+	}
 	if (len == 0) {
 		return NULL;
 	}
@@ -71,10 +333,15 @@
 	if (mb_ptr == NULL) {
 		return NULL;
 	}
-	len = ::WideCharToMultiByte(code_page, flags,
-								wstr_ptr, (DWORD)wstr_len,
-								mb_ptr, len,
-								NULL,NULL);
+	if (code_page == CP_UTF8) {
+		len = WideCharToUTF8(wstr_ptr, (DWORD)wstr_len,
+							 mb_ptr, len);
+	} else {
+		len = ::WideCharToMultiByte(code_page, flags,
+									wstr_ptr, (DWORD)wstr_len,
+									mb_ptr, len,
+									NULL,NULL);
+	}
 	if (len == 0) {
 		free(mb_ptr);
 		return NULL;
@@ -106,9 +373,15 @@
 	if (str_len == 0) {
 		str_len = strlen(str_ptr) + 1;
 	}
-	int len = ::MultiByteToWideChar(code_page, flags,
+	int len;
+	if (code_page == CP_UTF8) {
+		len = UTF8ToWideChar(str_ptr, (int)str_len,
+							 NULL, 0);
+	} else {
+		len = ::MultiByteToWideChar(code_page, flags,
 									str_ptr, (int)str_len,
 									NULL, 0);
+	}
 	if (len == 0) {
 		return NULL;
 	}
@@ -116,9 +389,14 @@
 	if (wstr_ptr == NULL) {
 		return NULL;
 	}
-	len = ::MultiByteToWideChar(code_page, flags,
-								str_ptr, (int)str_len,
-								wstr_ptr, len);
+	if (code_page == CP_UTF8) {
+		len = UTF8ToWideChar(str_ptr, (int)str_len,
+							 wstr_ptr, len);
+	} else {
+		len = ::MultiByteToWideChar(code_page, flags,
+									str_ptr, (int)str_len,
+									wstr_ptr, len);
+	}
 	if (len == 0) {
 		free(wstr_ptr);
 		return NULL;

Modified: trunk/teraterm/common/codeconv.h
===================================================================
--- trunk/teraterm/common/codeconv.h	2019-02-03 02:55:12 UTC (rev 7421)
+++ trunk/teraterm/common/codeconv.h	2019-02-03 14:43:53 UTC (rev 7422)
@@ -34,9 +34,14 @@
 extern "C" {
 #endif
 
+
+// 1char
+size_t UTF32ToUTF8(unsigned int u32, char *u8_ptr, size_t u8_len);
+size_t UTF8ToUTF32(const char *u8_ptr_, size_t u8_len, unsigned int *u32_);
+
 // MultiByteToWideChar() wrappers
-// CP932(shift-jis) to wchar(UTF-16)
-int CP932ToWideChar(const char *cp932_ptr, int cp932_len, wchar_t *wstr_ptr, int wstr_len);
+int WideCharToUTF8(const wchar_t *wstr_ptr, int wstr_len, char *u8_ptr, int u8_len);
+int UTF8ToWideChar(const char *u8_ptr, int u8_len, wchar_t *wstr_ptr, int wstr_len);
 
 // API wrappers
 char *_WideCharToMultiByte(const wchar_t *wstr_ptr, size_t wstr_len, int code_page, size_t *mb_len_);