Kouhei Sutou
null+****@clear*****
Tue Apr 10 17:55:43 JST 2018
Kouhei Sutou	2018-04-10 17:55:43 +0900 (Tue, 10 Apr 2018)

  New Revision: f69194a967620025a0806a5f57e792a76cb7cd0a
  https://github.com/groonga/groonga/commit/f69194a967620025a0806a5f57e792a76cb7cd0a

  Message:
    Extract temporary lexicon creation code

  Added files:
    lib/proc/proc_lexicon.c
  Modified files:
    lib/grn_proc.h
    lib/proc/proc_tokenize.c
    lib/proc/sources.am

  Modified: lib/grn_proc.h (+6 -0)
===================================================================
--- lib/grn_proc.h    2018-04-10 17:43:42 +0900 (5f01ad58a)
+++ lib/grn_proc.h    2018-04-10 17:55:43 +0900 (06d135458)
@@ -153,6 +153,12 @@ grn_expr_flags grn_proc_expr_query_flags_parse(grn_ctx *ctx,
                                                size_t query_flags_size,
                                                const char *error_message_tag);
 
+grn_obj *grn_proc_lexicon_open(grn_ctx *ctx,
+                               grn_raw_string *tokenizer_raw,
+                               grn_raw_string *normalizer_raw,
+                               grn_raw_string *token_filters_raw,
+                               const char *context_tag);
+
 #ifdef __cplusplus
 }
 #endif

  Added: lib/proc/proc_lexicon.c (+92 -0) 100644
===================================================================
--- /dev/null
+++ lib/proc/proc_lexicon.c    2018-04-10 17:55:43 +0900 (ca5bbcba1)
@@ -0,0 +1,92 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+  Copyright(C) 2009-2018 Brazil
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License version 2.1 as published by the Free Software Foundation.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include "../grn_proc.h"
+#include "../grn_ctx.h"
+
+#include <groonga/plugin.h>
+
+grn_obj *
+grn_proc_lexicon_open(grn_ctx *ctx,
+                      grn_raw_string *tokenizer_raw,
+                      grn_raw_string *normalizer_raw,
+                      grn_raw_string *token_filters_raw,
+                      const char *context_tag)
+{
+  grn_obj *lexicon;
+  grn_obj *normalizer = NULL;
+
+  if (normalizer_raw->length > 0) {
+    normalizer = grn_ctx_get(ctx,
+                             normalizer_raw->value,
+                             normalizer_raw->length);
+    if (!normalizer) {
+      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                       "%s nonexistent normalizer: <%.*s>",
+                       context_tag,
+                       (int)normalizer_raw->length,
+                       normalizer_raw->value);
+      return NULL;
+    }
+
+    if (!grn_obj_is_normalizer_proc(ctx, normalizer)) {
+      grn_obj inspected;
+      GRN_TEXT_INIT(&inspected, 0);
+      grn_inspect(ctx, &inspected, normalizer);
+      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                       "%s not normalizer: %.*s",
+                       context_tag,
+                       (int)GRN_TEXT_LEN(&inspected),
+                       GRN_TEXT_VALUE(&inspected));
+      GRN_OBJ_FIN(ctx, &inspected);
+      grn_obj_unlink(ctx, normalizer);
+      return NULL;
+    }
+  }
+
+  lexicon = grn_table_create(ctx, NULL, 0,
+                             NULL,
+                             GRN_OBJ_TABLE_HASH_KEY,
+                             grn_ctx_at(ctx, GRN_DB_SHORT_TEXT),
+                             NULL);
+  {
+    grn_obj tokenizer;
+    GRN_TEXT_INIT(&tokenizer, GRN_OBJ_DO_SHALLOW_COPY);
+    GRN_TEXT_SET(ctx, &tokenizer, tokenizer_raw->value, tokenizer_raw->length);
+    grn_obj_set_info(ctx, lexicon, GRN_INFO_DEFAULT_TOKENIZER, &tokenizer);
+    GRN_OBJ_FIN(ctx, &tokenizer);
+  }
+  if (ctx->rc != GRN_SUCCESS) {
+    grn_obj_close(ctx, lexicon);
+    GRN_PLUGIN_ERROR(ctx, ctx->rc,
+                     "%s failed to set tokenizer: <%.*s>: %s",
+                     context_tag,
+                     (int)(tokenizer_raw->length),
+                     tokenizer_raw->value,
+                     ctx->errbuf);
+    return NULL;
+  }
+  if (normalizer) {
+    grn_obj_set_info(ctx, lexicon,
+                     GRN_INFO_NORMALIZER, normalizer);
+    grn_obj_unlink(ctx, normalizer);
+  }
+  grn_proc_table_set_token_filters(ctx, lexicon, token_filters_raw);
+
+  return lexicon;
+}

  Modified: lib/proc/proc_tokenize.c (+16 -73)
===================================================================
--- lib/proc/proc_tokenize.c    2018-04-10 17:43:42 +0900 (8e9f0737c)
+++ lib/proc/proc_tokenize.c    2018-04-10 17:55:43 +0900 (b26386d3c)
@@ -117,72 +117,6 @@ output_tokens(grn_ctx *ctx, grn_obj *tokens, grn_obj *lexicon, grn_obj *index_co
   grn_ctx_output_array_close(ctx);
 }
 
-static grn_obj *
-create_lexicon_for_tokenize(grn_ctx *ctx,
-                            grn_raw_string *tokenizer_raw,
-                            grn_raw_string *normalizer_raw,
-                            grn_raw_string *token_filters_raw)
-{
-  grn_obj *lexicon;
-  grn_obj *normalizer = NULL;
-
-  if (normalizer_raw->length > 0) {
-    normalizer = grn_ctx_get(ctx,
-                             normalizer_raw->value,
-                             normalizer_raw->length);
-    if (!normalizer) {
-      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
-                       "[tokenize] nonexistent normalizer: <%.*s>",
-                       (int)normalizer_raw->length,
-                       normalizer_raw->value);
-      return NULL;
-    }
-
-    if (!grn_obj_is_normalizer_proc(ctx, normalizer)) {
-      grn_obj inspected;
-      GRN_TEXT_INIT(&inspected, 0);
-      grn_inspect(ctx, &inspected, normalizer);
-      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
-                       "[tokenize] not normalizer: %.*s",
-                       (int)GRN_TEXT_LEN(&inspected),
-                       GRN_TEXT_VALUE(&inspected));
-      GRN_OBJ_FIN(ctx, &inspected);
-      grn_obj_unlink(ctx, normalizer);
-      return NULL;
-    }
-  }
-
-  lexicon = grn_table_create(ctx, NULL, 0,
-                             NULL,
-                             GRN_OBJ_TABLE_HASH_KEY,
-                             grn_ctx_at(ctx, GRN_DB_SHORT_TEXT),
-                             NULL);
-  {
-    grn_obj tokenizer;
-    GRN_TEXT_INIT(&tokenizer, GRN_OBJ_DO_SHALLOW_COPY);
-    GRN_TEXT_SET(ctx, &tokenizer, tokenizer_raw->value, tokenizer_raw->length);
-    grn_obj_set_info(ctx, lexicon, GRN_INFO_DEFAULT_TOKENIZER, &tokenizer);
-    GRN_OBJ_FIN(ctx, &tokenizer);
-  }
-  if (ctx->rc != GRN_SUCCESS) {
-    grn_obj_close(ctx, lexicon);
-    GRN_PLUGIN_ERROR(ctx, ctx->rc,
-                     "[tokenize] failed to set tokenizer: <%.*s>: %s",
-                     (int)(tokenizer_raw->length),
-                     tokenizer_raw->value,
-                     ctx->errbuf);
-    return NULL;
-  }
-  if (normalizer) {
-    grn_obj_set_info(ctx, lexicon,
-                     GRN_INFO_NORMALIZER, normalizer);
-    grn_obj_unlink(ctx, normalizer);
-  }
-  grn_proc_table_set_token_filters(ctx, lexicon, token_filters_raw);
-
-  return lexicon;
-}
-
 static void
 tokenize(grn_ctx *ctx,
          grn_obj *lexicon,
@@ -348,6 +282,7 @@ grn_proc_init_table_tokenize(grn_ctx *ctx)
 static grn_obj *
 command_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 {
+  const char *context_tag = "[tokenize]";
   grn_raw_string tokenizer_raw;
   grn_raw_string string_raw;
   grn_raw_string normalizer_raw;
@@ -373,12 +308,18 @@ command_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_da
 #undef GET_VALUE
 
   if (tokenizer_raw.length == 0) {
-    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[tokenize] tokenizer name is missing");
+    GRN_PLUGIN_ERROR(ctx,
+                     GRN_INVALID_ARGUMENT,
+                     "%s tokenizer name is missing",
+                     context_tag);
     return NULL;
   }
 
   if (string_raw.length == 0) {
-    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[tokenize] string is missing");
+    GRN_PLUGIN_ERROR(ctx,
+                     GRN_INVALID_ARGUMENT,
+                     "%s string is missing",
+                     context_tag);
     return NULL;
   }
 
@@ -391,10 +332,11 @@ command_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_da
     return NULL;
   }
 
-  lexicon = create_lexicon_for_tokenize(ctx,
-                                        &tokenizer_raw,
-                                        &normalizer_raw,
-                                        &token_filters_raw);
+  lexicon = grn_proc_lexicon_open(ctx,
+                                  &tokenizer_raw,
+                                  &normalizer_raw,
+                                  &token_filters_raw,
+                                  context_tag);
   if (!lexicon) {
     return NULL;
   }
@@ -413,7 +355,8 @@ command_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_da
     output_tokens(ctx, &tokens, lexicon, NULL);
   } else {
     GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
-                     "[tokenize] invalid mode: <%.*s>",
+                     "%s invalid mode: <%.*s>",
+                     context_tag,
                      (int)mode_raw.length,
                      mode_raw.value);
   }

  Modified: lib/proc/sources.am (+1 -0)
===================================================================
--- lib/proc/sources.am    2018-04-10 17:43:42 +0900 (48b411fa9)
+++ lib/proc/sources.am    2018-04-10 17:55:43 +0900 (3ca24ee9f)
@@ -5,6 +5,7 @@ libgrnproc_la_SOURCES =	\
 	proc_fuzzy_search.c	\
 	proc_highlight.c	\
 	proc_in_records.c	\
+	proc_lexicon.c		\
 	proc_lock.c		\
 	proc_normalize.c	\
 	proc_object.c		\
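For context, a minimal sketch of how another command implementation could reuse the
extracted grn_proc_lexicon_open() helper, mirroring the way command_tokenize() calls
it in the diff above. This sketch is not part of the commit: the function
my_command_open_lexicon(), the "[my-command]" context tag, and the hard-coded
tokenizer and normalizer names are illustrative assumptions only.

/* Hypothetical caller sketch (not part of this commit). */
#include <string.h>

#include "../grn_proc.h"

#include <groonga/plugin.h>

static grn_obj *
my_command_open_lexicon(grn_ctx *ctx)
{
  grn_raw_string tokenizer_raw;
  grn_raw_string normalizer_raw;
  grn_raw_string token_filters_raw;
  grn_obj *lexicon;

  /* In a real command these values would come from the command's
     arguments; here they are hard-coded for illustration. */
  tokenizer_raw.value = "TokenBigram";
  tokenizer_raw.length = strlen("TokenBigram");
  normalizer_raw.value = "NormalizerAuto";
  normalizer_raw.length = strlen("NormalizerAuto");
  token_filters_raw.value = "";
  token_filters_raw.length = 0;

  /* The context tag is prefixed to every error message the helper
     reports, so failures are attributed to the calling command. */
  lexicon = grn_proc_lexicon_open(ctx,
                                  &tokenizer_raw,
                                  &normalizer_raw,
                                  &token_filters_raw,
                                  "[my-command]");
  if (!lexicon) {
    /* ctx->rc and ctx->errbuf already describe the failure. */
    return NULL;
  }

  /* ... tokenize against the temporary lexicon here ... */

  return lexicon; /* the caller closes it with grn_obj_close() when done */
}

Passing the context tag as a parameter is what lets the shared helper keep the
per-command "[tokenize]"-style prefixes in its error messages, which is why
command_tokenize() now defines a local context_tag variable in the diff above.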