Revision: 10325 https://osdn.net/projects/ttssh2/scm/svn/commits/10325 Author: nmaya Date: 2022-10-18 22:01:25 +0900 (Tue, 18 Oct 2022) Log Message: ----------- 同じ Unicode block にある文字だけを、ヴィラーマによって結合するようにした Unicode block のテーブルを作成するスクリプトを追加した ticket #44424 Ticket Links: ------------ https://osdn.net/projects/ttssh2/tracker/detail/44424 Modified Paths: -------------- trunk/teraterm/teraterm/buffer.c trunk/teraterm/teraterm/unicode/readme.md trunk/teraterm/teraterm/unicode.cpp trunk/teraterm/teraterm/unicode.h Added Paths: ----------- trunk/teraterm/teraterm/unicode/get_block_table.md trunk/teraterm/teraterm/unicode/get_block_table.pl -------------- next part -------------- Modified: trunk/teraterm/teraterm/buffer.c =================================================================== --- trunk/teraterm/teraterm/buffer.c 2022-10-17 15:19:42 UTC (rev 10324) +++ trunk/teraterm/teraterm/buffer.c 2022-10-18 13:01:25 UTC (rev 10325) @@ -2740,7 +2740,16 @@ // \x83\x94\x83B\x83\x89\x81[\x83}\x8F\x88\x97\x9D if (UnicodeIsVirama(p->u32_last) != 0) { - return p; + // 1\x82O\x82̃\x94\x83B\x83\x89\x81[\x83}\x82Ɠ\xAF\x82\xB6 block \x82̕\xB6\x8E\x9A\x82ł\xA0\x82\xE9 + int block_index_last = UnicodeBlockIndex(p->u32_last); + int block_index = UnicodeBlockIndex(u32); +#if 0 + OutputDebugPrintf("U+%06x, %d, %s\n", p->u32_last, block_index_last, UnicodeBlockName(block_index_last)); + OutputDebugPrintf("U+%06x, %d, %s\n", u32, block_index, UnicodeBlockName(block_index)); +#endif + if (block_index_last == block_index) { + return p; + } } return NULL; } Added: trunk/teraterm/teraterm/unicode/get_block_table.md =================================================================== --- trunk/teraterm/teraterm/unicode/get_block_table.md (rev 0) +++ trunk/teraterm/teraterm/unicode/get_block_table.md 2022-10-18 13:01:25 UTC (rev 10325) @@ -0,0 +1,16 @@ +# unicode の block + +- 元情報 unicode.org + - https://www.unicode.org/reports/tr44/#Blocks.txt + +# テーブルの作り方 + +- Blocks.txt をダウンロード +- スクリプトを実行 +- unicode_block.tbl が出力される + +実行例 +``` +wget https://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt +perl get_block_table.pl +``` Added: trunk/teraterm/teraterm/unicode/get_block_table.pl =================================================================== --- trunk/teraterm/teraterm/unicode/get_block_table.pl (rev 0) +++ trunk/teraterm/teraterm/unicode/get_block_table.pl 2022-10-18 13:01:25 UTC (rev 10325) @@ -0,0 +1,34 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use utf8; + +my $src_file = "Blocks.txt"; +my $fname_out = "unicode_block.tbl"; + +open(FILE, $src_file) || die "Cannot open $src_file."; +open(OUT, ">:crlf:utf8", $fname_out) || die "Cannot open $fname_out."; +print OUT "// this file was generated by get_block_table.pl\n"; + +my $v = <FILE>; +chop($v); +print OUT "// $v\n"; +$v = <FILE>; +chop($v); +print OUT "// $v\n"; + +my $ostart = -1; +my $otype = ""; +my $oend = 0; +my $type; +while(my $a = <FILE>) { + if ($a =~ /^([0-9A-F]+)\.\.([0-9A-F]+); ([- 0-9A-Za-z]+)/) { + my $start = hex $1; + my $end = hex $2; + my $name = $3; + printf(OUT "{ 0x%06x, 0x%06x, \"$name\" },\n", $start, $end); + } else { + next; + } +} Modified: trunk/teraterm/teraterm/unicode/readme.md =================================================================== --- trunk/teraterm/teraterm/unicode/readme.md 2022-10-17 15:19:42 UTC (rev 10324) +++ trunk/teraterm/teraterm/unicode/readme.md 2022-10-18 13:01:25 UTC (rev 10325) @@ -44,6 +44,11 @@ - 絵文字判定のためのテーブル - [get_emoji_table.md](get_emoji_table.md) +## [unicode_block.tbl](../unicode_block.tbl) + +- Unicode block のテーブル +- [get_block_table.md](get_block_table.md) + ## iso8859-X.md - [iso8859.md](iso8859.md) Modified: trunk/teraterm/teraterm/unicode.cpp =================================================================== --- trunk/teraterm/teraterm/unicode.cpp 2022-10-17 15:19:42 UTC (rev 10324) +++ trunk/teraterm/teraterm/unicode.cpp 2022-10-18 13:01:25 UTC (rev 10325) @@ -100,6 +100,16 @@ unsigned char category; } UnicodeTableCombine_t; +typedef struct { + unsigned long code_from; + unsigned long code_to; + char *block_name; +} UnicodeTableBlock_t; + +const UnicodeTableBlock_t UnicodeBlockList[] = { +#include "unicode_block.tbl" +}; + /** * u32\x82\xAA\x83e\x81[\x83u\x83\x8B\x82̃f\x81[\x83^\x82Ɋ܂܂\xEA\x82Ă\xA2\x82邩\x92\xB2\x82ׂ\xE9 * @@ -165,6 +175,39 @@ return -1; } +/** + * SearchTableSimple() \x82Ɠ\xAF\x82\xB6 + * \x83e\x81[\x83u\x83\x8B\x82̌^\x82\xAA\x88قȂ\xE9 + * + * @retval \x83e\x81[\x83u\x83\x8B\x82\xCCindex + * @retval -1 \x83e\x81[\x83u\x83\x8B\x82ɑ\xB6\x8D݂\xB5\x82Ȃ\xA2 + */ +static int SearchTableBlock( + const UnicodeTableBlock_t *table, size_t table_size, + unsigned long u32) +{ + if (u32 < table[0].code_from) { + return -1; + } + if (u32 > table[table_size-1].code_to) { + return -1; + } + size_t low = 0; + size_t high = table_size - 1; + while (low <= high) { + size_t mid = (low + high) / 2; + if (table[mid].code_from <= u32 && u32 <= table[mid].code_to) { + return (int)mid; + } else if (table[mid].code_to < u32) { + low = mid + 1; + } else { + high = mid - 1; + } + } + // \x83e\x81[\x83u\x83\x8B\x82͈̔͊O + return -1; +} + /* * \x8C\x8B\x8D\x87\x95\xB6\x8E\x9A\x82\xA9\x8C\x9F\x8D\xB8\x82\xB7\x82\xE9 * \x8E\x9F\x82̕\xB6\x8E\x9A\x82\xE0\x8C\x8B\x8D\x87\x95\xB6\x8E\x9A\x82Ƃ\xB5\x82Ĉ\xB5\x82\xA4 @@ -244,7 +287,25 @@ return index != -1 ? 1 : 0; } +/** + * Unicode block \x82\xCC index \x82\xE9 + * + * @retval -1 block \x82\xAA\x8C\xA9\x82\xA9\x82\xE7\x82Ȃ\xA2 + * @retval block \x82\xCC index + */ +int UnicodeBlockIndex(unsigned long u32) +{ + return SearchTableBlock(UnicodeBlockList, _countof(UnicodeBlockList), u32); +} +char *UnicodeBlockName(int index) +{ + if (index == -1) { + return ""; + } + return UnicodeBlockList[index].block_name; +} + #if 0 int main(int, char *[]) { Modified: trunk/teraterm/teraterm/unicode.h =================================================================== --- trunk/teraterm/teraterm/unicode.h 2022-10-17 15:19:42 UTC (rev 10324) +++ trunk/teraterm/teraterm/unicode.h 2022-10-18 13:01:25 UTC (rev 10325) @@ -40,6 +40,8 @@ int UnicodeFromISO8859(int part, unsigned char b, unsigned short *u16); int UnicodeToISO8859(int part, unsigned long u32, unsigned char *b); int UnicodeIsVirama(unsigned long u32); +int UnicodeBlockIndex(unsigned long u32); +char *UnicodeBlockName(int); #ifdef __cplusplus }