Kouhei Sutou
null+****@clear*****
Tue Feb 28 13:33:47 JST 2017
Kouhei Sutou 2017-02-28 13:33:47 +0900 (Tue, 28 Feb 2017) New Revision: 8a8441144be5c4284c78db2c4561ccd6c5a84449 https://github.com/groonga/groonga/commit/8a8441144be5c4284c78db2c4561ccd6c5a84449 Message: ii: use the previous buffer allocation algorithm again The algorithm was used until 7dd19103de5df2f8c0af7ac47b9149d421a0aa5d. The algorithm is for natural language. It's suitable for natural language but it doesn't perform well for some non-natural-language text data. So we introduced the new buffer allocation algorithm. But the new buffer allocation algorithm increases index size for natural language. So we use the previous buffer allocation algorithm again just for natural language text. We assume that the target text is natural language if the lexicon has a tokenizer. Modified files: lib/ii.c Modified: lib/ii.c (+59 -32) =================================================================== --- lib/ii.c 2017-02-28 13:07:37 +0900 (eca9b0c) +++ lib/ii.c 2017-02-28 13:33:47 +0900 (2495b22) @@ -4045,53 +4045,80 @@ buffer_new_lexicon_pat(grn_ctx *ctx, key_size = grn_table_get_key(ctx, ii->lexicon, id, key, GRN_TABLE_MAX_KEY_SIZE); if (ii->lexicon->header.flags & GRN_OBJ_KEY_VAR_SIZE) { - int target_key_size = key_size; - int reduced_key_size = 0; - - while (*lseg == NOT_ASSIGNED && target_key_size > 0) { - grn_id tid; + grn_obj *tokenizer = NULL; + grn_table_get_info(ctx, ii->lexicon, NULL, NULL, &tokenizer, NULL, NULL); + if (tokenizer) { + /* For natural language */ cursor = grn_pat_cursor_open(ctx, (grn_pat *)(ii->lexicon), - key, target_key_size, - NULL, 0, 0, -1, - GRN_CURSOR_PREFIX); - if (!cursor) { - break; - } - - if (reduced_key_size == 0) { + key, + key_size, + NULL, + 0, + 0, + -1, + GRN_CURSOR_ASCENDING|GRN_CURSOR_GT); + if (cursor) { + grn_id tid; while (ctx->rc == GRN_SUCCESS && *lseg == NOT_ASSIGNED && (tid = grn_pat_cursor_next(ctx, cursor))) { buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg); } - } else { - while (ctx->rc == GRN_SUCCESS && - 
*lseg == NOT_ASSIGNED && - (tid = grn_pat_cursor_next(ctx, cursor))) { - void *current_key; - int current_key_size; + grn_pat_cursor_close(ctx, cursor); + } + } else { + /* For text data */ + int target_key_size = key_size; + int reduced_key_size = 0; - current_key_size = grn_pat_cursor_get_key(ctx, cursor, &current_key); - if (memcmp(((char *)current_key) + target_key_size, - key + target_key_size, - reduced_key_size) == 0) { - continue; + while (*lseg == NOT_ASSIGNED && target_key_size > 0) { + grn_id tid; + + cursor = grn_pat_cursor_open(ctx, + (grn_pat *)(ii->lexicon), + key, target_key_size, + NULL, 0, 0, -1, + GRN_CURSOR_PREFIX); + if (!cursor) { + break; + } + + if (reduced_key_size == 0) { + while (ctx->rc == GRN_SUCCESS && + *lseg == NOT_ASSIGNED && + (tid = grn_pat_cursor_next(ctx, cursor))) { + buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg); + } + } else { + while (ctx->rc == GRN_SUCCESS && + *lseg == NOT_ASSIGNED && + (tid = grn_pat_cursor_next(ctx, cursor))) { + void *current_key; + int current_key_size; + + current_key_size = grn_pat_cursor_get_key(ctx, cursor, &current_key); + if (memcmp(((char *)current_key) + target_key_size, + key + target_key_size, + reduced_key_size) == 0) { + continue; + } + buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg); } - buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg); } - } - grn_pat_cursor_close(ctx, cursor); + grn_pat_cursor_close(ctx, cursor); - if (reduced_key_size == 0) { - reduced_key_size = 1; - } else { - reduced_key_size *= 2; + if (reduced_key_size == 0) { + reduced_key_size = 1; + } else { + reduced_key_size *= 2; + } + target_key_size -= reduced_key_size; } - target_key_size -= reduced_key_size; } } else { + /* For other data */ cursor = grn_pat_cursor_open(ctx, (grn_pat *)(ii->lexicon), NULL, 0, key, key_size, 0, -1, -------------- next part -------------- HTML����������������������������...下载