[Ttssh2-commit] [7422] UTF8,UTF16間の変換を自分で行うようにした

Back to archive index
scmno****@osdn***** scmno****@osdn*****
2019年 2月 3日 (日) 23:43:53 JST


Revision: 7422
          http://sourceforge.jp/projects/ttssh2/scm/svn/commits/7422
Author:   zmatsuo
Date:     2019-02-03 23:43:53 +0900 (Sun, 03 Feb 2019)
Log Message:
-----------
UTF8,UTF16間の変換を自分で行うようにした
(Windows95対策)

Modified Paths:
--------------
    trunk/teraterm/common/codeconv.cpp
    trunk/teraterm/common/codeconv.h

-------------- next part --------------
Modified: trunk/teraterm/common/codeconv.cpp
===================================================================
--- trunk/teraterm/common/codeconv.cpp	2019-02-03 02:55:12 UTC (rev 7421)
+++ trunk/teraterm/common/codeconv.cpp	2019-02-03 14:43:53 UTC (rev 7422)
@@ -29,20 +29,276 @@
 #include <windows.h>
 #include <string.h>
 #include <crtdbg.h>
-
+#if (defined(_MSC_VER) && (_MSC_VER >= 1600)) || !defined(_MSC_VER)
+#include <stdint.h>
+#endif
 #include "codeconv.h"
 
+#if defined(_MSC_VER) && (_MSC_VER < 1600)
+typedef unsigned char	uint8_t;
+typedef unsigned short  uint16_t;
+typedef unsigned int	uint32_t;
+#endif
+
 #ifdef _DEBUG
 #define malloc(l)     _malloc_dbg((l), _NORMAL_BLOCK, __FILE__, __LINE__)
 #define free(p)       _free_dbg((p), _NORMAL_BLOCK)
+#define _strdup(s)	  _strdup_dbg((s), _NORMAL_BLOCK, __FILE__, __LINE__)
+#define _wcsdup(s)    _wcsdup_dbg((s), _NORMAL_BLOCK, __FILE__, __LINE__)
 #endif
 
-int CP932ToWideChar(const char *cp932_ptr, int cp932_len, wchar_t *wstr_ptr, int wstr_len)
+/**
+ * UTF-32 から UTF-8 へ変換する
+ * @param[in]		u32		変換するUTF-32
+ * @param[in,out]	u8_ptr	変換後UTF-8文字列出力先(NULLのとき出力しない)
+ * @param[in]		u8_len	UTF-8出力先文字数(バッファ長,byte数)
+ * @retval			使用したutf8文字数(byte数）
+ *					0=エラー
+ */
+size_t UTF32ToUTF8(uint32_t u32, char *u8_ptr_, size_t u8_len)
 {
-	return MultiByteToWideChar(932, 0, cp932_ptr, cp932_len, wstr_ptr, wstr_len);
+	size_t out_len = 0;
+	uint8_t *u8_ptr = (uint8_t *)u8_ptr_;
+	if (u8_ptr != NULL) {
+		u8_len = 4;
+	}
+
+	if (u32 <= 0x0000007f) {
+		// 0x00000000 <= u32 <= 0x0000007f
+		if (u8_len >= 1) {
+			if (u8_ptr != NULL) {
+				u8_ptr[0] = (uint8_t)u32;
+			}
+			out_len = 1;
+		}
+	} else if (u32 <= 0x000007ff) {
+		// 0x00000080 <= u32 <= 0x000007ff
+		if (u8_len >= 2) {
+			if (u8_ptr != NULL) {
+				u8_ptr[0] = ((u32 >> 6) & 0x1f) | 0xc0;
+				u8_ptr[1] = (u32 & 0x3f) | 0x80;
+			}
+			out_len = 2;
+		}
+	} else if (u32 <= 0x0000ffff) {
+		// 0x00000800 <= u32 <= 0x0000ffff
+		if (u8_len >= 3) {
+			if (u8_ptr != NULL) {
+				u8_ptr[0] = ((u32 >> 12) & 0xf) | 0xe0;
+				u8_ptr[1] = ((u32 >> 6) & 0x3f) | 0x80;
+				u8_ptr[2] = (u32 & 0x3f) | 0x80;
+			}
+			out_len = 3;
+		}
+	} else if (u32 <= 0x0010ffff) {
+		// 0x00010000 <= u32 <= 0x0010ffff
+		if (u8_len >= 4) {
+			if (u8_ptr != NULL) {
+				u8_ptr[0] = ((uint8_t)(u32 >> 18)) | 0xf0;
+				u8_ptr[1] = ((u32 >> 12) & 0x3f) | 0x80;
+				u8_ptr[2] = ((u32 >> 6) & 0x3f) | 0x80;
+				u8_ptr[3] = (u32 & 0x3f) | 0x80;
+			}
+			out_len = 4;
+		}
+	} else {
+		out_len = 0;
+	}
+	return out_len;
 }
 
 /**
+ * UTF-8文字列からUTF-32を1文字取り出す
+ * @param[in]	u8_ptr	UTF-8文字列へのポインタ
+ * @param[in]	u8_len	UTF-8文字列長さ
+ * @param[out]	u32		変換したUTF-32文字
+ * @retval		使用したUTF-8文字数(byte数）
+ *				0=エラー
+ */
+size_t UTF8ToUTF32(const char *u8_ptr_, size_t u8_len, uint32_t *u32_)
+{
+	uint8_t *u8_ptr = (uint8_t *)u8_ptr_;
+	uint32_t u32;
+	size_t u8_in;
+	const uint8_t c1 = *u8_ptr++;
+    if (c1 <= 0x7f) {
+		// 1byte
+		if (u8_len >= 1) {
+			u32 = (uint32_t)c1;
+			u8_in = 1;
+		} else {
+			goto error;
+		}
+	} else if (0xc2 <= c1 && c1 <= 0xdf) {
+		// 2byte
+		if (u8_len >= 2) {
+			const uint8_t c2 = *u8_ptr++;
+			if (((c1 & 0x1e) != 0) &&
+				((c2 & 0xc0) == 0x80))
+			{
+				u32 = (uint32_t)((c1 & 0x1f) << 6) + (c2 & 0x3f);
+				u8_in = 2;
+			} else {
+				goto error;
+			}
+		} else {
+			goto error;
+		}
+	} else if (0xe0 <= c1 && c1 <= 0xef) {
+		// 3byte
+		if (u8_len >= 3) {
+			const uint8_t c2 = *u8_ptr++;
+			const uint8_t c3 = *u8_ptr++;
+			if ((((c1 & 0x0f) != 0) || ((c2 & 0x20) != 0)) &&
+				((c2 & 0xc0) == 0x80) &&
+				((c3 & 0xc0) == 0x80) )
+			{
+				u32 = (uint32_t)((c1 & 0x0f) << 12) + ((c2 & 0x3f) << 6);
+				u32 += (c3 & 0x3f);
+				u8_in = 3;
+			} else {
+				goto error;
+			}
+		} else {
+			goto error;
+		}
+	} else if (0xf0 <= c1 && c1 <= 0xf7 && u8_len >= 4) {
+		// 4byte
+		if (u8_len >= 4) {
+			const uint8_t c2 = *u8_ptr++;
+			const uint8_t c3 = *u8_ptr++;
+			const uint8_t c4 = *u8_ptr++;
+			if ((((c1 & 0x07) != 0) || ((c2 & 0x30) != 0)) &&
+				((c2 & 0xc0) == 0x80) &&
+				((c3 & 0xc0) == 0x80) &&
+				((c4 & 0xc0) == 0x80) )
+			{
+				u32 = (uint32_t)((c1 & 0x07) << 18) + ((c2 & 0x3f) << 12);
+				u32 += ((c3 & 0x3f) << 6) + (c4 & 0x3f);
+				u8_in = 4;
+			} else {
+				goto error;
+			}
+		} else {
+			goto error;
+		}
+    } else {
+	error:
+		u32 = 0;
+		u8_in = 0;
+	}
+	*u32_ = u32;
+	return u8_in;
+}
+
+// WideCharToMultiByteのUTF8特化版
+int WideCharToUTF8(const wchar_t *wstr_ptr, int wstr_len, char *u8_ptr, int u8_len)
+{
+	int u8_out_sum = 0;
+	if (u8_ptr == NULL) {
+		u8_len = 4;
+	} else {
+		if (u8_len == 0) {
+			return 0;
+		}
+	}
+	if (wstr_len < 0) {
+		wstr_len = (int)wcslen(wstr_ptr) + 1;
+	}
+
+	while(u8_len > 0 && wstr_len > 0) {
+		const wchar_t u16 = *wstr_ptr++;
+		uint32_t u32 = u16;
+		size_t u8_out;
+		wstr_len--;
+		// サロゲート high?
+		if (0xd800 <= u16 && u16 < 0xdc00) {
+			if (wstr_len >= 1) {
+				const wchar_t u16_lo = *wstr_ptr++;
+				wstr_len--;
+				// サロゲート low?
+				if (0xdc00 <= u16_lo && u16_lo < 0xe000) {
+					// サロゲートペア デコード
+					u32 = 0x10000 + (u16 - 0xd800) * 0x400 + (u16_lo - 0xdc00);
+				} else {
+					goto unknown_code;
+				}
+			} else {
+			unknown_code:
+				if (u8_ptr != NULL) {
+					*u8_ptr++ = '?';
+				}
+				u8_out = 1;
+				goto loop_next;
+			}
+		}
+		u8_out = UTF32ToUTF8(u32, u8_ptr, u8_len);
+		if (u8_out == 0) {
+			goto unknown_code;
+		}
+	loop_next:
+		u8_out_sum += u8_out;
+		if (u8_ptr != NULL) {
+			u8_ptr += u8_out;
+			u8_len -= u8_out;
+		}
+	}
+	return u8_out_sum;
+}
+
+// MultiByteToWideCharのUTF8特化版
+int UTF8ToWideChar(const char *u8_ptr, int u8_len, wchar_t *wstr_ptr, int wstr_len)
+{
+	size_t u16_out_sum = 0;
+	if (u8_len < 0) {
+		u8_len = strlen(u8_ptr) + 1;
+	}
+	if (wstr_ptr == NULL) {
+		wstr_len = 1;
+	}
+
+	while(wstr_len > 0 && u8_len > 0) {
+		uint32_t u32;
+		size_t u16_out;
+		size_t u8_in = UTF8ToUTF32(u8_ptr, u8_len, &u32);
+		if (u8_in == 0) {
+			u32 = '?';
+			u8_in = 1;
+		}
+		u8_ptr += u8_in;
+		u8_len -= u8_in;
+
+		if (u32 < 0x10000) {
+			if (wstr_ptr != NULL) {
+				*wstr_ptr++ = (uint16_t)u32;
+			}
+			u16_out = 1;
+		} else if (u32 <= 0x10ffff) {
+			if (wstr_len > 2) {
+				if (wstr_ptr != NULL) {
+					// サロゲート エンコード
+					*wstr_ptr++ = uint16_t((u32 - 0x10000) / 0x400) + 0xd800;
+					*wstr_ptr++ = uint16_t((u32 - 0x10000) % 0x400) + 0xdc00;
+				}
+				u16_out = 2;
+			} else {
+				*wstr_ptr++ = '?';
+				u16_out = 1;
+			}
+		} else {
+			*wstr_ptr++ = '?';
+			u16_out = 1;
+		}
+
+		if (wstr_ptr != NULL) {
+			wstr_len -= u16_out;
+		}
+		u16_out_sum += u16_out;
+	}
+	return u16_out_sum;
+}
+
+/**
  *	wchar_t文字列をマルチバイト文字列へ変換
  *	@param[in]	*wstr_ptr	wchar_t文字列
  *	@param[in]	wstr_len	wchar_t文字列長(0のとき自動)
@@ -60,10 +316,16 @@
 	if (wstr_len == 0) {
 		wstr_len = wcslen(wstr_ptr) + 1;
 	}
-    int len = ::WideCharToMultiByte(code_page, flags,
+    int len;
+	if (code_page == CP_UTF8) {
+		len = WideCharToUTF8(wstr_ptr, (DWORD)wstr_len,
+							 NULL, 0);
+	} else {
+		len = ::WideCharToMultiByte(code_page, flags,
 									wstr_ptr, (DWORD)wstr_len,
 									NULL, 0,
 									NULL, NULL);
+	}
 	if (len == 0) {
 		return NULL;
 	}
@@ -71,10 +333,15 @@
 	if (mb_ptr == NULL) {
 		return NULL;
 	}
-	len = ::WideCharToMultiByte(code_page, flags,
-								wstr_ptr, (DWORD)wstr_len,
-								mb_ptr, len,
-								NULL,NULL);
+	if (code_page == CP_UTF8) {
+		len = WideCharToUTF8(wstr_ptr, (DWORD)wstr_len,
+							 mb_ptr, len);
+	} else {
+		len = ::WideCharToMultiByte(code_page, flags,
+									wstr_ptr, (DWORD)wstr_len,
+									mb_ptr, len,
+									NULL,NULL);
+	}
 	if (len == 0) {
 		free(mb_ptr);
 		return NULL;
@@ -106,9 +373,15 @@
 	if (str_len == 0) {
 		str_len = strlen(str_ptr) + 1;
 	}
-	int len = ::MultiByteToWideChar(code_page, flags,
+	int len;
+	if (code_page == CP_UTF8) {
+		len = UTF8ToWideChar(str_ptr, (int)str_len,
+							 NULL, 0);
+	} else {
+		len = ::MultiByteToWideChar(code_page, flags,
 									str_ptr, (int)str_len,
 									NULL, 0);
+	}
 	if (len == 0) {
 		return NULL;
 	}
@@ -116,9 +389,14 @@
 	if (wstr_ptr == NULL) {
 		return NULL;
 	}
-	len = ::MultiByteToWideChar(code_page, flags,
-								str_ptr, (int)str_len,
-								wstr_ptr, len);
+	if (code_page == CP_UTF8) {
+		len = UTF8ToWideChar(str_ptr, (int)str_len,
+							 wstr_ptr, len);
+	} else {
+		len = ::MultiByteToWideChar(code_page, flags,
+									str_ptr, (int)str_len,
+									wstr_ptr, len);
+	}
 	if (len == 0) {
 		free(wstr_ptr);
 		return NULL;

Modified: trunk/teraterm/common/codeconv.h
===================================================================
--- trunk/teraterm/common/codeconv.h	2019-02-03 02:55:12 UTC (rev 7421)
+++ trunk/teraterm/common/codeconv.h	2019-02-03 14:43:53 UTC (rev 7422)
@@ -34,9 +34,14 @@
 extern "C" {
 #endif
 
+
+// 1char
+size_t UTF32ToUTF8(unsigned int u32, char *u8_ptr, size_t u8_len);
+size_t UTF8ToUTF32(const char *u8_ptr_, size_t u8_len, unsigned int *u32_);
+
 // MultiByteToWideChar() wrappers
-//	CP932(shift-jis) to wchar(UTF-16)
-int CP932ToWideChar(const char *cp932_ptr, int cp932_len, wchar_t *wstr_ptr, int wstr_len);
+int WideCharToUTF8(const wchar_t *wstr_ptr, int wstr_len, char *u8_ptr, int u8_len);
+int UTF8ToWideChar(const char *u8_ptr, int u8_len, wchar_t *wstr_ptr, int wstr_len);
 
 // API wrappers
 char *_WideCharToMultiByte(const wchar_t *wstr_ptr, size_t wstr_len, int code_page, size_t *mb_len_);


Ttssh2-commit メーリングリストの案内
Back to archive index