Revision: 10763 https://osdn.net/projects/ttssh2/scm/svn/commits/10763 Author: zmatsuo Date: 2023-06-14 23:24:25 +0900 (Wed, 14 Jun 2023) Log Message: ----------- 受信文字コードがUTF-8の時の不正な文字の扱いを修正 - UTF-8として不正な文字の扱いを変更 - 修正前は常にISO8859-1 として扱っていた - FallbackToCP932=OFF時 - '?' を表示 - FallbackToCP932=ON時 - 日本語/UTF-8の時、可能ならShift_JISとして扱う - その他の場合は ISO8859-1 として扱う ticket #48226 Ticket Links: ------------ https://osdn.net/projects/ttssh2/tracker/detail/48226 Modified Paths: -------------- trunk/teraterm/teraterm/charset.c trunk/teraterm/teraterm/charset.h trunk/teraterm/teraterm/vtterm.c -------------- next part -------------- Modified: trunk/teraterm/teraterm/charset.c =================================================================== --- trunk/teraterm/teraterm/charset.c 2023-06-14 13:34:36 UTC (rev 10762) +++ trunk/teraterm/teraterm/charset.c 2023-06-14 14:24:25 UTC (rev 10763) @@ -42,9 +42,16 @@ #include "codeconv.h" #include "unicode.h" #include "language.h" // for JIS2SJIS() +#include "ttcstd.h" #include "charset.h" +// UTF-8が不正な値だった時に表示する文字 +#define REPLACEMENT_CHARACTER '?' 
+//#define REPLACEMENT_CHARACTER 0x2592 +//#define REPLACEMENT_CHARACTER 0x20 +//#define REPLACEMENT_CHARACTER 0xfffd + static BOOL KanjiIn; // TRUE = MBCSの1byte目を受信している static BOOL EUCkanaIn, EUCsupIn; static int EUCcount; @@ -59,7 +66,7 @@ /* JIS -> SJIS conversion flag */ static BOOL ConvJIS; static WORD Kanji; -BOOL Fallbacked; +static BOOL Fallbacked; typedef struct { /* GL, GR code group */ @@ -66,6 +73,8 @@ int Glr[2]; /* G0, G1, G2, G3 code group */ int Gn[4]; + // + char32_t replacement_char; } VttermKanjiWork; static VttermKanjiWork KanjiWork; @@ -107,7 +116,11 @@ */ void CharSetInit(void) { - CharSetInit2(&KanjiWork); + VttermKanjiWork *w = &KanjiWork; + + CharSetInit2(w); + + w->replacement_char = REPLACEMENT_CHARACTER; SSflag = FALSE; KanjiIn = FALSE; @@ -140,7 +153,23 @@ assert(FALSE); return FALSE; } + /** + * Double-byte Character Sets + * SJISの1byte目? + * + * 第1バイト0x81...0x9F or 0xE0...0xEF + * 第1バイト0x81...0x9F or 0xE0...0xFC + */ +static BOOL ismbbleadSJIS(BYTE b) +{ + if (((0x80<b) && (b<0xa0)) || ((0xdf<b) && (b<0xfd))) { + return TRUE; + } + return FALSE; +} + +/** * ts.Language == IdJapanese 時 * 1byte目チェック */ @@ -278,7 +307,7 @@ } break; case IdUTF8: - PutChar('?'); + PutU32(REPLACEMENT_CHARACTER); break; default: ParseControl(b); @@ -293,7 +322,7 @@ } break; case IdUTF8: - PutChar('?'); + PutU32(REPLACEMENT_CHARACTER); break; default: ParseControl(b); @@ -499,7 +528,7 @@ } else if ((b>=0x20) && (b<=0x7E)) { PutU32(b); } else if ((b==0x8E) || (b==0x8F)) { - PutChar('?'); + PutU32(REPLACEMENT_CHARACTER); } else if ((b>=0x80) && (b<=0x9F)) { ParseControl(b); } else if (b>=0xA0) { @@ -507,11 +536,29 @@ } } +static void PutReplacementChr(VttermKanjiWork *w, const BYTE *ptr, size_t len) +{ + const char32_t replacement_char = w->replacement_char; + int i; + for (i = 0; i < len; i++) { + BYTE c = *ptr++; + if (c < 0x80) { + // 
不正なUTF-8文字列のなかに0x80未満があれば、 + // 1文字のUTF-8文字としてそのまま表示する + ParseASCII(c); + } + else { + PutU32(replacement_char); + } + } +} + // UTF-8で受信データを処理する // returns TRUE if b is processed // (actually allways returns TRUE) static BOOL ParseFirstUTF8(BYTE b) { + VttermKanjiWork *w = &KanjiWork; static BYTE buf[4]; static int count = 0; @@ -518,10 +565,19 @@ unsigned int code; int i; - if (ts.FallbackToCP932 && Fallbacked) { - return ParseFirstJP(b); + if (Fallbacked) { + BOOL r = ParseFirstJP(b); + Fallbacked = FALSE; + return r; } + if (b < 0x20) { + PutReplacementChr(w, buf, count); + count = 0; + ParseASCII(b); + return TRUE; + } + // UTF-8エンコード // Unicode 1byte, 2byte, 3byte, 4byte // U+0000 ... U+007f 0x00 .. 0x7f @@ -536,7 +592,7 @@ // - 2byte目以降 // - 0x00 - 0x7f // - 0xc0 - 0xff - +recheck: // 1byte(7bit) if (count == 0) { if ((b & 0x80) == 0x00) { @@ -546,10 +602,29 @@ return TRUE; } if ((b & 0x40) == 0x00 || b >= 0xf6 ) { - // UTF-8で1byteに出現しないコードのとき、そのまま出力 + // UTF-8で1byteに出現しないコードのとき // 0x40 = 0b1011_1111, 0b10xx_xxxxというbitパターンにはならない // 0xf6 以上のとき U+10FFFFより大きくなる - PutU32(b); + if (ts.FallbackToCP932) { + // fallbackする場合 + if ((ts.Language == IdJapanese) && ismbbleadSJIS(b)) { + // 日本語の場合 && Shift_JIS 1byte目 + // Shift_JIS に fallback + Fallbacked = TRUE; + ConvJIS = FALSE; + Kanji = b << 8; + KanjiIn = TRUE; + return TRUE; + } + // fallback ISO8859-1 + PutU32(b); + return TRUE; + } + else { + // fallbackしない, 
不正な文字入力 + buf[0] = b; + PutReplacementChr(w, buf, 1); + } return TRUE; } // 1byte目保存 @@ -569,11 +644,16 @@ } } if (code == 0){ - // そのまま出力 - PutU32(buf[0]); - PutU32(b); + if (ts.FallbackToCP932) { + // fallback ISO8859-1 + PutU32(buf[0]); + } + else { + buf[1] = b; + PutReplacementChr(w, buf, 1); + } count = 0; - return TRUE; + goto recheck; } else { PutU32(code); @@ -602,12 +682,16 @@ } } if (code == 0) { - // そのまま出力 - PutU32(buf[0]); - PutU32(buf[1]); - PutU32(buf[2]); + if (ts.FallbackToCP932) { + // fallback ISO8859-1 + PutU32(buf[0]); + PutU32(buf[1]); + } + else { + PutReplacementChr(w, buf, 2); + } count = 0; - return TRUE; + goto recheck; } else { PutU32(code); count = 0; @@ -633,13 +717,17 @@ } } if (code == 0) { - // そのまま出力 - PutU32(buf[0]); - PutU32(buf[1]); - PutU32(buf[2]); - PutU32(buf[3]); + if (ts.FallbackToCP932) { + // fallback ISO8859-1 + PutU32(buf[0]); + PutU32(buf[1]); + PutU32(buf[2]); + } + else { + PutReplacementChr(w, buf, 3); + } count = 0; - return TRUE; + goto recheck; } else { PutU32(code); count = 0; @@ -862,3 +950,13 @@ w->Gn[i] = state->infos[2 + i]; } } + +/** + * フォールバックの終了 + * 受信データUTF-8時に、Shift_JIS出力中(fallback状態)を中断する + * + */ +void CharSetFallbackFinish(void) +{ + Fallbacked = FALSE; +} Modified: trunk/teraterm/teraterm/charset.h =================================================================== --- trunk/teraterm/teraterm/charset.h 2023-06-14 13:34:36 UTC (rev 10762) +++ trunk/teraterm/teraterm/charset.h 2023-06-14 14:24:25 UTC (rev 10763) @@ -30,7 +30,7 @@ int infos[6]; } CharSetState; -extern BOOL Fallbacked; +//extern BOOL Fallbacked; // input void ParseFirst(BYTE b); @@ -48,3 +48,4 @@ BOOL CharSetIsSpecial(BYTE b); void CharSetSaveState(CharSetState *state); void CharSetLoadState(const CharSetState *state); 
+void CharSetFallbackFinish(void); Modified: trunk/teraterm/teraterm/vtterm.c =================================================================== --- trunk/teraterm/teraterm/vtterm.c 2023-06-14 13:34:36 UTC (rev 10762) +++ trunk/teraterm/teraterm/vtterm.c 2023-06-14 14:24:25 UTC (rev 10763) @@ -654,7 +654,7 @@ else if (CursorX < CursorLeftM) MoveCursor(0, CursorY); - Fallbacked = FALSE; + CharSetFallbackFinish(); } void LineFeed(BYTE b, BOOL logFlag) @@ -677,7 +677,7 @@ if (LFMode) CarriageReturn(logFlag); - Fallbacked = FALSE; + CharSetFallbackFinish(); } static void Tab(void) @@ -2079,7 +2079,7 @@ else MoveCursor(CursorX,Param[1]-1); } - Fallbacked = FALSE; + CharSetFallbackFinish(); } static void CSMoveToXY() // CUP / HVP @@ -2110,7 +2110,7 @@ } MoveCursor(NewX, NewY); - Fallbacked = FALSE; + CharSetFallbackFinish(); } static void CSDeleteTabStop()