Kouhei Sutou 2018-11-05 14:20:06 +0900 (Mon, 05 Nov 2018) Revision: 3403b7ce9b9b5289f9c2ad181dac2312c5c32d59 https://github.com/groonga/groonga/commit/3403b7ce9b9b5289f9c2ad181dac2312c5c32d59 Message: NormalierNFKC100: unify after normalize It's changed from unifying while normalizing. The previous approach doesn't support a case that changes the number of characters. Modified files: lib/normalizer.c Modified: lib/normalizer.c (+246 -93) =================================================================== --- lib/normalizer.c 2018-11-05 14:19:03 +0900 (a6a080be8) +++ lib/normalizer.c 2018-11-05 14:20:06 +0900 (baf44b1aa) @@ -573,7 +573,7 @@ typedef struct { unsigned char *d; unsigned char *d_; unsigned char *de; - uint_least8_t *cp; + uint8_t *cp; uint64_t *offsets; size_t length; size_t ls; @@ -678,7 +678,7 @@ grn_nfkc_normalize_expand(grn_ctx *ctx, data->string->checks = checks; } if (data->cp) { - uint_least8_t *ctypes; + uint8_t *ctypes; if (!(ctypes = GRN_REALLOC(data->string->ctypes, data->ds + 1))) { ERR(GRN_NO_MEMORY_AVAILABLE, "[normalize][nfkc] failed to expand character types space"); @@ -1126,104 +1126,269 @@ grn_nfkc_normalize_unify_katakana_bu_sound(const unsigned char *utf8_char, return GRN_FALSE; } -grn_inline static grn_char_type +static void grn_nfkc_normalize_unify(grn_ctx *ctx, - grn_nfkc_normalize_data *data, - grn_char_type char_type) + grn_nfkc_normalize_data *data) { - if (data->options->unify_kana && - char_type == GRN_CHAR_KATAKANA && - data->lp == 3) { - data->p = grn_nfkc_normalize_unify_kana(data->p, data->unified_kana); - if (data->p == data->unified_kana) { - char_type = GRN_CHAR_HIRAGANA; + const unsigned char *current = data->string->normalized; + const unsigned char *end = data->d; + size_t i_byte; + size_t n_bytes = end - current; + size_t i_character; + unsigned char *unified = NULL; + unsigned char *unified_end = NULL; + unsigned char *unified_previous = NULL; + unsigned char *unified_current; + uint8_t *unified_char_types = NULL; + uint8_t *unified_char_types_current = NULL; + int16_t *unified_checks = NULL; + int16_t *unified_checks_current = NULL; + uint64_t *unified_offsets = NULL; + uint64_t *unified_offsets_current = NULL; + unsigned int unified_n_characters = 0; + size_t unified_data_size = n_bytes; + + if (!(data->options->unify_kana || + data->options->unify_kana_case || + data->options->unify_kana_voiced_sound_mark || + data->options->unify_hyphen || + data->options->unify_prolonged_sound_mark || + data->options->unify_hyphen_and_prolonged_sound_mark || + data->options->unify_middle_dot || + data->options->unify_katakana_v_sounds || + data->options->unify_katakana_bu_sound || + data->options->unify_hyphen)) { + return; + } + + unified = GRN_MALLOC(unified_data_size + 1); + if (!unified) {goto exit;} + unified_end = unified + unified_data_size; + unified_current = unified; + + if (data->ch) { + unified_checks = GRN_MALLOC(sizeof(int16_t) * (unified_data_size + 1)); + if (!unified_checks) { + goto exit; } + unified_checks_current = unified_checks; + } + if (data->cp) { + unified_char_types = GRN_MALLOC(sizeof(uint8_t) * (unified_data_size + 1)); + if (!unified_char_types) { + goto exit; + } + unified_char_types_current = unified_char_types; } - if (data->options->unify_kana_case) { - switch (char_type) { - case GRN_CHAR_HIRAGANA : - if (data->lp == 3) { - data->p = grn_nfkc_normalize_unify_hiragana_case( - data->p, data->unified_kana_case); + if (data->offsets) { + unified_offsets = GRN_MALLOC(sizeof(uint64_t) * (unified_data_size + 1)); + if (!unified_char_types) { + goto exit; + } + unified_offsets_current = unified_offsets; + } + + i_byte = 0; + i_character = 0; + while (current < end) { + const unsigned char *unifying = current; + size_t char_length; + size_t unified_char_length; + grn_char_type char_type; + grn_bool skip = GRN_FALSE; + + char_length = grn_charlen_(ctx, current, end, GRN_ENC_UTF8); + unified_char_length = char_length; + + if (data->cp) { + char_type = data->string->ctypes[i_character]; + } else { + char_type = data->options->char_type_func(current); + } + + if (data->options->unify_kana && + char_type == GRN_CHAR_KATAKANA && + unified_char_length == 3) { + unifying = grn_nfkc_normalize_unify_kana(unifying, data->unified_kana); + if (unifying == data->unified_kana) { + char_type = GRN_CHAR_HIRAGANA; } - break; - case GRN_CHAR_KATAKANA : - if (data->lp == 3) { - data->p = grn_nfkc_normalize_unify_katakana_case( - data->p, data->unified_kana_case); + } + + if (data->options->unify_kana_case) { + switch (char_type) { + case GRN_CHAR_HIRAGANA : + if (unified_char_length == 3) { + unifying = grn_nfkc_normalize_unify_hiragana_case( + unifying, data->unified_kana_case); + } + break; + case GRN_CHAR_KATAKANA : + if (unified_char_length == 3) { + unifying = grn_nfkc_normalize_unify_katakana_case( + unifying, data->unified_kana_case); + } + break; + default : + break; } - break; - default : - break; } - } - if (data->options->unify_kana_voiced_sound_mark) { - switch (char_type) { - case GRN_CHAR_HIRAGANA : - if (data->lp == 3) { - data->p = grn_nfkc_normalize_unify_hiragana_voiced_sound_mark( - data->p, data->unified_kana_voiced_sound_mark); + if (data->options->unify_kana_voiced_sound_mark) { + switch (char_type) { + case GRN_CHAR_HIRAGANA : + if (unified_char_length == 3) { + unifying = grn_nfkc_normalize_unify_hiragana_voiced_sound_mark( + unifying, data->unified_kana_voiced_sound_mark); + } + break; + case GRN_CHAR_KATAKANA : + if (unified_char_length == 3) { + unifying = grn_nfkc_normalize_unify_katakana_voiced_sound_mark( + unifying, data->unified_kana_voiced_sound_mark); + } + break; + default : + break; } - break; - case GRN_CHAR_KATAKANA : - if (data->lp == 3) { - data->p = grn_nfkc_normalize_unify_katakana_voiced_sound_mark( - data->p, data->unified_kana_voiced_sound_mark); + } + + if (data->options->unify_hyphen) { + if (grn_nfkc_normalize_is_hyphen_famity(unifying, unified_char_length)) { + unifying = data->unified_hyphen; + unified_char_length = sizeof(data->unified_hyphen); + char_type = GRN_CHAR_SYMBOL; } - break; - default : - break; } - } - if (data->options->unify_hyphen) { - if (grn_nfkc_normalize_is_hyphen_famity(data->p, data->lp)) { - data->p = data->unified_hyphen; - data->lp = sizeof(data->unified_hyphen); - char_type = GRN_CHAR_SYMBOL; + if (data->options->unify_prolonged_sound_mark) { + if (grn_nfkc_normalize_is_prolonged_sound_mark_famity(unifying, + unified_char_length)) { + unifying = data->unified_prolonged_sound_mark; + unified_char_length = sizeof(data->unified_prolonged_sound_mark); + char_type = GRN_CHAR_KATAKANA; + } } - } - if (data->options->unify_prolonged_sound_mark) { - if (grn_nfkc_normalize_is_prolonged_sound_mark_famity(data->p, data->lp)) { - data->p = data->unified_prolonged_sound_mark; - data->lp = sizeof(data->unified_prolonged_sound_mark); - char_type = GRN_CHAR_KATAKANA; + if (data->options->unify_hyphen_and_prolonged_sound_mark) { + if (grn_nfkc_normalize_is_hyphen_famity(unifying, unified_char_length) || + grn_nfkc_normalize_is_prolonged_sound_mark_famity(unifying, + unified_char_length)) { + unifying = data->unified_hyphen; + unified_char_length = sizeof(data->unified_hyphen); + char_type = GRN_CHAR_SYMBOL; + } } - } - if (data->options->unify_hyphen_and_prolonged_sound_mark) { - if (grn_nfkc_normalize_is_hyphen_famity(data->p, data->lp) || - grn_nfkc_normalize_is_prolonged_sound_mark_famity(data->p, data->lp)) { - data->p = data->unified_hyphen; - data->lp = sizeof(data->unified_hyphen); - char_type = GRN_CHAR_SYMBOL; + if (data->options->unify_middle_dot) { + if (grn_nfkc_normalize_is_middle_dot_family(unifying, + unified_char_length)) { + unifying = data->unified_middle_dot; + unified_char_length = sizeof(data->unified_middle_dot); + char_type = GRN_CHAR_SYMBOL; + } } - } - if (data->options->unify_middle_dot) { - if (grn_nfkc_normalize_is_middle_dot_family(data->p, data->lp)) { - data->p = data->unified_middle_dot; - data->lp = sizeof(data->unified_middle_dot); - char_type = GRN_CHAR_SYMBOL; + if (data->options->unify_katakana_v_sounds) { + if (grn_nfkc_normalize_unify_katakana_v_sounds(unifying, + unified_char_length, + unified_previous, + unified_current)) { + skip = GRN_TRUE; + } } - } - if (data->options->unify_katakana_v_sounds) { - if (grn_nfkc_normalize_unify_katakana_v_sounds(data->p, data->lp, data->d_, data->d)) { - data->lp = 0; + if (data->options->unify_katakana_bu_sound) { + if (grn_nfkc_normalize_unify_katakana_bu_sound(unifying, + unified_char_length, + unified_previous, + unified_current)) { + skip = GRN_TRUE; + } } - } - if (data->options->unify_katakana_bu_sound) { - if (grn_nfkc_normalize_unify_katakana_bu_sound(data->p, data->lp, data->d_, data->d)) { - data->lp = 0; + if (!skip) { + if (unified_current + unified_char_length >= unified_end) { + /* TODO: Expand automatically. */ + ERR(GRN_NO_MEMORY_AVAILABLE, + "[normalize][nfkc] too large unified data"); + goto exit; + } + grn_memcpy(unified_current, unifying, unified_char_length); + unified_previous = unified_current; + unified_current += unified_char_length; + unified_n_characters++; + if (unified_char_types_current) { + *(unified_char_types_current++) = char_type; + } + if (unified_checks_current) { + size_t i; + *(unified_checks_current++) = data->string->checks[i_byte]; + for (i = 1; i < unified_char_length; i++) { + *(unified_checks_current++) = 0; + } + } + if (unified_offsets_current) { + *(unified_offsets_current++) = data->string->offsets[i_character]; + } } + + i_byte += char_length; + current += char_length; + i_character++; + } + if (data->options->unify_katakana_v_sounds) { + grn_nfkc_normalize_unify_katakana_v_sounds(NULL, + 0, + unified_previous, + unified_current); + } + if (data->options->unify_katakana_bu_sound) { + grn_nfkc_normalize_unify_katakana_bu_sound(NULL, + 0, + unified_previous, + unified_current); } - return char_type; + GRN_FREE(data->string->normalized); + if (data->string->checks) { + GRN_FREE(data->string->checks); + } + if (data->string->ctypes) { + GRN_FREE(data->string->ctypes); + } + if (data->string->offsets) { + GRN_FREE(data->string->offsets); + } + data->string->normalized = unified; + data->d = unified_current; + data->d_ = unified_previous; + data->string->checks = unified_checks; + data->ch = unified_checks_current; + data->string->ctypes = unified_char_types; + data->cp = unified_char_types_current; + data->string->offsets = unified_offsets; + data->offsets = unified_offsets_current; + data->length = unified_n_characters; + unified = NULL; + unified_checks = NULL; + unified_char_types = NULL; + unified_offsets = NULL; + +exit: + if (unified) { + GRN_FREE(unified); + } + if (unified_checks) { + GRN_FREE(unified_checks); + } + if (unified_char_types) { + GRN_FREE(unified_char_types); + } + if (unified_offsets) { + GRN_FREE(unified_offsets); + } } grn_rc @@ -1285,10 +1450,6 @@ grn_nfkc_normalize(grn_ctx *ctx, data.s_ += data.lp; } } else { - size_t lp_original = data.lp; - grn_char_type char_type; - char_type = data.options->char_type_func(data.p); - if (data.de <= data.d + data.lp) { grn_nfkc_normalize_expand(ctx, &data); if (ctx->rc != GRN_SUCCESS) { @@ -1296,18 +1457,16 @@ grn_nfkc_normalize(grn_ctx *ctx, } } - { - const unsigned char *p = data.p; - char_type = grn_nfkc_normalize_unify(ctx, &data, char_type); - grn_memcpy(data.d, data.p, data.lp); - data.p = p; - } - + grn_memcpy(data.d, data.p, data.lp); data.d_ = data.d; if (data.lp > 0) { data.d += data.lp; data.length++; - if (data.cp) { *(data.cp++) = char_type; } + if (data.cp) { + grn_char_type char_type; + char_type = data.options->char_type_func(data.p); + *(data.cp++) = char_type; + } if (data.ch) { size_t i; if (data.s_ == data.s + data.ls) { @@ -1324,18 +1483,12 @@ grn_nfkc_normalize(grn_ctx *ctx, (uint64_t)(data.s - (const unsigned char *)(data.string->original)); } } - data.lp = lp_original; } } } + grn_nfkc_normalize_unify(ctx, &data); if (data.cp) { *(data.cp) = GRN_CHAR_NULL; } if (data.offsets) { *(data.offsets) = data.string->original_length_in_bytes; } - if (data.options->unify_katakana_v_sounds) { - grn_nfkc_normalize_unify_katakana_v_sounds(NULL, 0, data.d_, data.d); - } - if (data.options->unify_katakana_bu_sound) { - grn_nfkc_normalize_unify_katakana_bu_sound(NULL, 0, data.d_, data.d); - } *(data.d) = '\0'; data.string->n_characters = data.length; data.string->normalized_length_in_bytes = -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20181105/36d06d86/attachment-0001.html>