Kouhei Sutou 2018-11-01 16:08:29 +0900 (Thu, 01 Nov 2018) Revision: 89c0fd9fc20ac09091024bad8acc889206a17391 https://github.com/groonga/groonga/commit/89c0fd9fc20ac09091024bad8acc889206a17391 Message: Add built-in TokenFilterNFKC100 token filter Added files: lib/grn_token_filters.h lib/token_filters.c test/command/suite/token_filters/nfkc100/unify_kana.expected test/command/suite/token_filters/nfkc100/unify_kana.test Modified files: lib/c_sources.am lib/db.c test/command/suite/schema/plugins.expected test/command/suite/schema/tables/columns/compress/lz4.expected test/command/suite/schema/tables/columns/compress/zlib.expected test/command/suite/schema/tables/columns/compress/zstd.expected test/command/suite/schema/tables/columns/type/index_medium.expected test/command/suite/schema/tables/columns/type/index_small.expected test/command/suite/schema/tables/columns/type/scalar.expected test/command/suite/schema/tables/columns/type/vector.expected test/command/suite/schema/tables/normalizer.expected test/command/suite/schema/tables/normalizer_with_options.expected test/command/suite/schema/tables/token_filters.expected test/command/suite/schema/tables/token_filters_with_options.expected test/command/suite/schema/tables/tokenizer.expected test/command/suite/schema/tables/tokenizer_with_options.expected test/command/suite/schema/tables/type/array.expected test/command/suite/schema/tables/type/hash_table.expected test/command/suite/schema/tables/value_type/reference.expected test/command/suite/schema/tables/value_type/type.expected Modified: lib/c_sources.am (+2 -0) =================================================================== --- lib/c_sources.am 2018-11-01 16:08:01 +0900 (287f7f8e5) +++ lib/c_sources.am 2018-11-01 16:08:29 +0900 (a7a647f19) @@ -114,6 +114,8 @@ libgroonga_c_sources = \ tokenizers.c \ grn_tokenizers.h \ token_filter.c \ + token_filters.c \ + grn_token_filters.h \ util.c \ grn_util.h \ windows.c \ Modified: lib/db.c (+3 -0) =================================================================== --- lib/db.c 2018-11-01 16:08:01 +0900 (3b6a7a60e) +++ lib/db.c 2018-11-01 16:08:29 +0900 (a741f2fa2) @@ -31,6 +31,7 @@ #include "grn_ctx_impl.h" #include "grn_token_cursor.h" #include "grn_tokenizers.h" +#include "grn_token_filters.h" #include "grn_proc.h" #include "grn_plugin.h" #include "grn_geo.h" @@ -408,6 +409,7 @@ grn_db_open(grn_ctx *ctx, const char *path) grn_db_init_builtin_scorers(ctx); grn_db_init_builtin_commands(ctx); grn_db_init_builtin_window_functions(ctx); + grn_db_init_builtin_token_filters(ctx); if (grn_table_size(ctx, (grn_obj *)s) > n_records) { grn_obj_flush(ctx, (grn_obj *)s); @@ -13308,6 +13310,7 @@ grn_db_init_builtin_types(grn_ctx *ctx) } grn_db_init_builtin_commands(ctx); grn_db_init_builtin_window_functions(ctx); + grn_db_init_builtin_token_filters(ctx); for (id = grn_db_curr_id(ctx, db) + 1; id < GRN_N_RESERVED_TYPES; id++) { grn_itoh(id, buf + 3, 2); grn_obj_register(ctx, db, buf, 5); Added: lib/grn_token_filters.h (+31 -0) 100644 =================================================================== --- /dev/null +++ lib/grn_token_filters.h 2018-11-01 16:08:29 +0900 (48307fe67) @@ -0,0 +1,31 @@ +/* -*- c-basic-offset: 2 -*- */ +/* + Copyright(C) 2018 Kouhei Sutou <kou****@clear*****> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#pragma once + +#include "grn_ctx.h" + +#ifdef __cplusplus +extern "C" { +#endif + +grn_rc grn_db_init_builtin_token_filters(grn_ctx *ctx); + +#ifdef __cplusplus +} +#endif Added: lib/token_filters.c (+167 -0) 100644 =================================================================== --- /dev/null +++ lib/token_filters.c 2018-11-01 16:08:29 +0900 (c25ed476e) @@ -0,0 +1,167 @@ +/* -*- c-basic-offset: 2 -*- */ +/* + Copyright(C) 2018 Kouhei Sutou <kou****@clear*****> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include "grn_token_filters.h" +#include "grn_normalizer.h" + +#include <groonga/token_filter.h> + +typedef struct { + grn_nfkc_normalize_options *options; + grn_string string; + grn_tokenizer_token token; +} grn_nfkc100_token_filter; + +static void * +nfkc100_open_options(grn_ctx *ctx, + grn_obj *token_filter, + grn_obj *raw_options, + void *user_data) +{ + grn_nfkc_normalize_options *options; + + options = GRN_MALLOC(sizeof(grn_nfkc_normalize_options)); + if (!options) { + GRN_PLUGIN_ERROR(ctx, + GRN_NO_MEMORY_AVAILABLE, + "[token-filter][nfkc100] " + "failed to allocate memory for options"); + return NULL; + } + + grn_nfkc100_normalize_options_init(ctx, options); + + grn_nfkc_normalize_options_apply(ctx, options, raw_options); + + return options; +} + +static void +nfkc100_close_options(grn_ctx *ctx, void *data) +{ + grn_nfkc_normalize_options *options = data; + grn_nfkc_normalize_options_fin(ctx, options); + GRN_PLUGIN_FREE(ctx, options); +} + +static void * +nfkc100_init(grn_ctx *ctx, grn_tokenizer_query *query) +{ + grn_obj *lexicon; + unsigned int i; + grn_nfkc_normalize_options *options; + grn_nfkc100_token_filter *token_filter; + + lexicon = grn_tokenizer_query_get_lexicon(ctx, query); + i = grn_tokenizer_query_get_token_filter_index(ctx, query); + options = grn_table_cache_token_filter_options(ctx, + lexicon, + i, + nfkc100_open_options, + nfkc100_close_options, + NULL); + if (ctx->rc != GRN_SUCCESS) { + return NULL; + } + + token_filter = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_nfkc100_token_filter)); + if (!token_filter) { + GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, + "[token-filter][nfkc100] " + "failed to allocate grn_nfkc100_token_filter"); + return NULL; + } + + token_filter->options = options; + grn_string_init(ctx, + (grn_obj *)(&(token_filter->string)), + grn_ctx_get(ctx, "NormalizerNFKC100", -1), + 0, + GRN_CTX_GET_ENCODING(ctx)); + grn_tokenizer_token_init(ctx, &(token_filter->token)); + + return token_filter; +} + +static void +nfkc100_filter(grn_ctx *ctx, + grn_token *current_token, + grn_token *next_token, + void *user_data) +{ + grn_nfkc100_token_filter *token_filter = user_data; + grn_id id; + grn_obj *data; + grn_obj *string; + + if (!token_filter) { + return; + } + + if (GRN_CTX_GET_ENCODING(ctx) != GRN_ENC_UTF8) { + return; + } + + data = grn_token_get_data(ctx, current_token); + string = (grn_obj *)(&(token_filter->string)); + grn_string_set_original(ctx, + string, + GRN_TEXT_VALUE(data), + GRN_TEXT_LEN(data)); + grn_nfkc_normalize(ctx, + string, + token_filter->options); + { + const char *normalized; + unsigned int normalized_length; + grn_string_get_normalized(ctx, + string, + &normalized, + &normalized_length, + NULL); + grn_token_set_data(ctx, next_token, normalized, normalized_length); + } +} + +static void +nfkc100_fin(grn_ctx *ctx, void *user_data) +{ + grn_nfkc100_token_filter *token_filter = user_data; + + if (!token_filter) { + return; + } + + grn_tokenizer_token_fin(ctx, &(token_filter->token)); + grn_string_fin(ctx, (grn_obj *)(&(token_filter->string))); + GRN_PLUGIN_FREE(ctx, token_filter); +} + +grn_rc +grn_db_init_builtin_token_filters(grn_ctx *ctx) +{ + { + grn_obj *token_filter; + token_filter = grn_token_filter_create(ctx, "TokenFilterNFKC100", -1); + grn_token_filter_set_init_func(ctx, token_filter, nfkc100_init); + grn_token_filter_set_filter_func(ctx, token_filter, nfkc100_filter); + grn_token_filter_set_fin_func(ctx, token_filter, nfkc100_fin); + } + + return GRN_SUCCESS; +} Modified: test/command/suite/schema/plugins.expected (+4 -0) =================================================================== --- test/command/suite/schema/plugins.expected 2018-11-01 16:08:01 +0900 (1ddc055f1) +++ test/command/suite/schema/plugins.expected 2018-11-01 16:08:29 +0900 (9063bca59) @@ -211,6 +211,10 @@ schema } }, "token_filters": { + "TokenFilterNFKC100": { + "id": 214, + "name": "TokenFilterNFKC100" + } }, "tables": { } Modified: test/command/suite/schema/tables/columns/compress/lz4.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/columns/compress/lz4.expected 2018-11-01 16:08:01 +0900 (ac1adadd1) +++ test/command/suite/schema/tables/columns/compress/lz4.expected 2018-11-01 16:08:29 +0900 (b34bb9321) @@ -210,6 +210,10 @@ schema } }, "token_filters": { + "TokenFilterNFKC100": { + "id": 214, + "name": "TokenFilterNFKC100" + } }, "tables": { "Logs": { Modified: test/command/suite/schema/tables/columns/compress/zlib.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/columns/compress/zlib.expected 2018-11-01 16:08:01 +0900 (28ebf9471) +++ test/command/suite/schema/tables/columns/compress/zlib.expected 2018-11-01 16:08:29 +0900 (f77305b4d) @@ -210,6 +210,10 @@ schema } }, "token_filters": { + "TokenFilterNFKC100": { + "id": 214, + "name": "TokenFilterNFKC100" + } }, "tables": { "Logs": { Modified: test/command/suite/schema/tables/columns/compress/zstd.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/columns/compress/zstd.expected 2018-11-01 16:08:01 +0900 (470518c79) +++ test/command/suite/schema/tables/columns/compress/zstd.expected 2018-11-01 16:08:29 +0900 (51c585f7c) @@ -210,6 +210,10 @@ schema } }, "token_filters": { + "TokenFilterNFKC100": { + "id": 214, + "name": "TokenFilterNFKC100" + } }, "tables": { "Logs": { Modified: test/command/suite/schema/tables/columns/type/index_medium.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/columns/type/index_medium.expected 2018-11-01 16:08:01 +0900 (aa9f4a989) +++ test/command/suite/schema/tables/columns/type/index_medium.expected 2018-11-01 16:08:29 +0900 (848b60b62) @@ -216,6 +216,10 @@ schema } }, "token_filters": { + "TokenFilterNFKC100": { + "id": 214, + "name": "TokenFilterNFKC100" + } }, "tables": { "Posts": { Modified: test/command/suite/schema/tables/columns/type/index_small.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/columns/type/index_small.expected 2018-11-01 16:08:01 +0900 (010c2f075) +++ test/command/suite/schema/tables/columns/type/index_small.expected 2018-11-01 16:08:29 +0900 (59ae37fae) @@ -216,6 +216,10 @@ schema } }, "token_filters": { + "TokenFilterNFKC100": { + "id": 214, + "name": "TokenFilterNFKC100" + } }, "tables": { "Posts": { Modified: test/command/suite/schema/tables/columns/type/scalar.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/columns/type/scalar.expected 2018-11-01 16:08:01 +0900 (91c546c87) +++ test/command/suite/schema/tables/columns/type/scalar.expected 2018-11-01 16:08:29 +0900 (6f77a4932) @@ -210,6 +210,10 @@ schema } }, "token_filters": { + "TokenFilterNFKC100": { + "id": 214, + "name": "TokenFilterNFKC100" + } }, "tables": { "Logs": { Modified: test/command/suite/schema/tables/columns/type/vector.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/columns/type/vector.expected 2018-11-01 16:08:01 +0900 (1667a56ed) +++ test/command/suite/schema/tables/columns/type/vector.expected 2018-11-01 16:08:29 +0900 (3ccfabc9c) @@ -212,6 +212,10 @@ schema } }, "token_filters": { + "TokenFilterNFKC100": { + "id": 214, + "name": "TokenFilterNFKC100" + } }, "tables": { "Posts": { Modified: test/command/suite/schema/tables/normalizer.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/normalizer.expected 2018-11-01 16:08:01 +0900 (89aca91fd) +++ test/command/suite/schema/tables/normalizer.expected 2018-11-01 16:08:29 +0900 (90a36ebbf) @@ -208,6 +208,10 @@ schema } }, "token_filters": { + "TokenFilterNFKC100": { + "id": 214, + "name": "TokenFilterNFKC100" + } }, "tables": { "Tags": { Modified: test/command/suite/schema/tables/normalizer_with_options.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/normalizer_with_options.expected 2018-11-01 16:08:01 +0900 (9815a7bb5) +++ test/command/suite/schema/tables/normalizer_with_options.expected 2018-11-01 16:08:29 +0900 (e3425678e) @@ -208,6 +208,10 @@ schema } }, "token_filters": { + "TokenFilterNFKC100": { + "id": 214, + "name": "TokenFilterNFKC100" + } }, "tables": { "Tags": { Modified: test/command/suite/schema/tables/token_filters.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/token_filters.expected 2018-11-01 16:08:01 +0900 (19d2e1484) +++ test/command/suite/schema/tables/token_filters.expected 2018-11-01 16:08:29 +0900 (a5a6555f2) @@ -213,6 +213,10 @@ schema } }, "token_filters": { + "TokenFilterNFKC100": { + "id": 214, + "name": "TokenFilterNFKC100" + }, "TokenFilterStopWord": { "id": 256, "name": "TokenFilterStopWord" Modified: test/command/suite/schema/tables/token_filters_with_options.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/token_filters_with_options.expected 2018-11-01 16:08:01 +0900 (d70d95796) +++ test/command/suite/schema/tables/token_filters_with_options.expected 2018-11-01 16:08:29 +0900 (912c430a1) @@ -213,6 +213,10 @@ schema } }, "token_filters": { + "TokenFilterNFKC100": { + "id": 214, + "name": "TokenFilterNFKC100" + }, "TokenFilterStopWord": { "id": 256, "name": "TokenFilterStopWord" Modified: test/command/suite/schema/tables/tokenizer.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/tokenizer.expected 2018-11-01 16:08:01 +0900 (026790528) +++ test/command/suite/schema/tables/tokenizer.expected 2018-11-01 16:08:29 +0900 (7ab8fdb57) @@ -208,6 +208,10 @@ schema } }, "token_filters": { + "TokenFilterNFKC100": { + "id": 214, + "name": "TokenFilterNFKC100" + } }, "tables": { "Terms": { Modified: test/command/suite/schema/tables/tokenizer_with_options.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/tokenizer_with_options.expected 2018-11-01 16:08:01 +0900 (c7ed034f7) +++ test/command/suite/schema/tables/tokenizer_with_options.expected 2018-11-01 16:08:29 +0900 (a1317090d) @@ -208,6 +208,10 @@ schema } }, "token_filters": { + "TokenFilterNFKC100": { + "id": 214, + "name": "TokenFilterNFKC100" + } }, "tables": { "Terms": { Modified: test/command/suite/schema/tables/type/array.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/type/array.expected 2018-11-01 16:08:01 +0900 (94661bca3) +++ test/command/suite/schema/tables/type/array.expected 2018-11-01 16:08:29 +0900 (abba3c010) @@ -208,6 +208,10 @@ schema } }, "token_filters": { + "TokenFilterNFKC100": { + "id": 214, + "name": "TokenFilterNFKC100" + } }, "tables": { "Logs": { Modified: test/command/suite/schema/tables/type/hash_table.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/type/hash_table.expected 2018-11-01 16:08:01 +0900 (a540989b7) +++ test/command/suite/schema/tables/type/hash_table.expected 2018-11-01 16:08:29 +0900 (3a16a8f9c) @@ -208,6 +208,10 @@ schema } }, "token_filters": { + "TokenFilterNFKC100": { + "id": 214, + "name": "TokenFilterNFKC100" + } }, "tables": { "Users": { Modified: test/command/suite/schema/tables/value_type/reference.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/value_type/reference.expected 2018-11-01 16:08:01 +0900 (a566e438f) +++ test/command/suite/schema/tables/value_type/reference.expected 2018-11-01 16:08:29 +0900 (da158d59c) @@ -210,6 +210,10 @@ schema } }, "token_filters": { + "TokenFilterNFKC100": { + "id": 214, + "name": "TokenFilterNFKC100" + } }, "tables": { "Logs": { Modified: test/command/suite/schema/tables/value_type/type.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/value_type/type.expected 2018-11-01 16:08:01 +0900 (b3a0afd59) +++ test/command/suite/schema/tables/value_type/type.expected 2018-11-01 16:08:29 +0900 (b5b0877bf) @@ -208,6 +208,10 @@ schema } }, "token_filters": { + "TokenFilterNFKC100": { + "id": 214, + "name": "TokenFilterNFKC100" + } }, "tables": { "Logs": { Added: test/command/suite/token_filters/nfkc100/unify_kana.expected (+25 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/token_filters/nfkc100/unify_kana.expected 2018-11-01 16:08:29 +0900 (796a9f213) @@ -0,0 +1,25 @@ +tokenize TokenDelimit "リンゴ りんご 林檎" --token_filters 'TokenFilterNFKC100("unify_kana", true)' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "value": "りんご", + "position": 0, + "force_prefix": false + }, + { + "value": "りんご", + "position": 1, + "force_prefix": false + }, + { + "value": "林檎", + "position": 2, + "force_prefix": false + } + ] +] Added: test/command/suite/token_filters/nfkc100/unify_kana.test (+2 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/token_filters/nfkc100/unify_kana.test 2018-11-01 16:08:29 +0900 (c2634c39c) @@ -0,0 +1,2 @@ +tokenize TokenDelimit "リンゴ りんご 林檎" \ + --token_filters 'TokenFilterNFKC100("unify_kana", true)' -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20181101/3332f7cb/attachment-0001.html>