[Groonga-commit] groonga/groonga at 89c0fd9 [master] Add built-in TokenFilterNFKC100 token filter

Back to archive index
Kouhei Sutou null+****@clear*****
Thu Nov 1 16:08:29 JST 2018


Kouhei Sutou	2018-11-01 16:08:29 +0900 (Thu, 01 Nov 2018)

  Revision: 89c0fd9fc20ac09091024bad8acc889206a17391
  https://github.com/groonga/groonga/commit/89c0fd9fc20ac09091024bad8acc889206a17391

  Message:
    Add built-in TokenFilterNFKC100 token filter

  Added files:
    lib/grn_token_filters.h
    lib/token_filters.c
    test/command/suite/token_filters/nfkc100/unify_kana.expected
    test/command/suite/token_filters/nfkc100/unify_kana.test
  Modified files:
    lib/c_sources.am
    lib/db.c
    test/command/suite/schema/plugins.expected
    test/command/suite/schema/tables/columns/compress/lz4.expected
    test/command/suite/schema/tables/columns/compress/zlib.expected
    test/command/suite/schema/tables/columns/compress/zstd.expected
    test/command/suite/schema/tables/columns/type/index_medium.expected
    test/command/suite/schema/tables/columns/type/index_small.expected
    test/command/suite/schema/tables/columns/type/scalar.expected
    test/command/suite/schema/tables/columns/type/vector.expected
    test/command/suite/schema/tables/normalizer.expected
    test/command/suite/schema/tables/normalizer_with_options.expected
    test/command/suite/schema/tables/token_filters.expected
    test/command/suite/schema/tables/token_filters_with_options.expected
    test/command/suite/schema/tables/tokenizer.expected
    test/command/suite/schema/tables/tokenizer_with_options.expected
    test/command/suite/schema/tables/type/array.expected
    test/command/suite/schema/tables/type/hash_table.expected
    test/command/suite/schema/tables/value_type/reference.expected
    test/command/suite/schema/tables/value_type/type.expected

  Modified: lib/c_sources.am (+2 -0)
===================================================================
--- lib/c_sources.am    2018-11-01 16:08:01 +0900 (287f7f8e5)
+++ lib/c_sources.am    2018-11-01 16:08:29 +0900 (a7a647f19)
@@ -114,6 +114,8 @@ libgroonga_c_sources =				\
 	tokenizers.c				\
 	grn_tokenizers.h			\
 	token_filter.c				\
+	token_filters.c				\
+	grn_token_filters.h			\
 	util.c					\
 	grn_util.h				\
 	windows.c				\

  Modified: lib/db.c (+3 -0)
===================================================================
--- lib/db.c    2018-11-01 16:08:01 +0900 (3b6a7a60e)
+++ lib/db.c    2018-11-01 16:08:29 +0900 (a741f2fa2)
@@ -31,6 +31,7 @@
 #include "grn_ctx_impl.h"
 #include "grn_token_cursor.h"
 #include "grn_tokenizers.h"
+#include "grn_token_filters.h"
 #include "grn_proc.h"
 #include "grn_plugin.h"
 #include "grn_geo.h"
@@ -408,6 +409,7 @@ grn_db_open(grn_ctx *ctx, const char *path)
     grn_db_init_builtin_scorers(ctx);
     grn_db_init_builtin_commands(ctx);
     grn_db_init_builtin_window_functions(ctx);
+    grn_db_init_builtin_token_filters(ctx);
 
     if (grn_table_size(ctx, (grn_obj *)s) > n_records) {
       grn_obj_flush(ctx, (grn_obj *)s);
@@ -13308,6 +13310,7 @@ grn_db_init_builtin_types(grn_ctx *ctx)
   }
   grn_db_init_builtin_commands(ctx);
   grn_db_init_builtin_window_functions(ctx);
+  grn_db_init_builtin_token_filters(ctx);
   for (id = grn_db_curr_id(ctx, db) + 1; id < GRN_N_RESERVED_TYPES; id++) {
     grn_itoh(id, buf + 3, 2);
     grn_obj_register(ctx, db, buf, 5);

  Added: lib/grn_token_filters.h (+31 -0) 100644
===================================================================
--- /dev/null
+++ lib/grn_token_filters.h    2018-11-01 16:08:29 +0900 (48307fe67)
@@ -0,0 +1,31 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+  Copyright(C) 2018 Kouhei Sutou <kou****@clear*****>
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License version 2.1 as published by the Free Software Foundation.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#pragma once
+
+#include "grn_ctx.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+grn_rc grn_db_init_builtin_token_filters(grn_ctx *ctx);
+
+#ifdef __cplusplus
+}
+#endif

  Added: lib/token_filters.c (+167 -0) 100644
===================================================================
--- /dev/null
+++ lib/token_filters.c    2018-11-01 16:08:29 +0900 (c25ed476e)
@@ -0,0 +1,167 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+  Copyright(C) 2018 Kouhei Sutou <kou****@clear*****>
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License version 2.1 as published by the Free Software Foundation.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include "grn_token_filters.h"
+#include "grn_normalizer.h"
+
+#include <groonga/token_filter.h>
+
+typedef struct { /* state allocated by nfkc100_init() and freed by nfkc100_fin() */
+  grn_nfkc_normalize_options *options; /* assigned in nfkc100_init() from the options lookup */
+  grn_string string; /* refilled and normalized for every token in nfkc100_filter() */
+  grn_tokenizer_token token; /* initialized in nfkc100_init(), finalized in nfkc100_fin() */
+} grn_nfkc100_token_filter;
+
+/* Options "open" callback passed to grn_table_cache_token_filter_options()
+ * by nfkc100_init(): allocates an options object, initializes it with
+ * grn_nfkc100_normalize_options_init() and applies raw_options to it.
+ * Returns the options, or NULL after reporting GRN_NO_MEMORY_AVAILABLE when
+ * the allocation fails. token_filter and user_data are not used. */
+static void *
+nfkc100_open_options(grn_ctx *ctx, grn_obj *token_filter,
+                     grn_obj *raw_options, void *user_data)
+{
+  grn_nfkc_normalize_options *options;
+
+  /* Allocate with the allocator that nfkc100_close_options() frees with. */
+  options = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_nfkc_normalize_options));
+  if (!options) {
+    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                     "[token-filter][nfkc100] "
+                     "failed to allocate memory for options");
+    return NULL;
+  }
+  grn_nfkc100_normalize_options_init(ctx, options);
+  grn_nfkc_normalize_options_apply(ctx, options, raw_options);
+  return options;
+}
+
+/* Options "close" callback: finalizes, then frees, the options in data. */
+static void
+nfkc100_close_options(grn_ctx *ctx, void *data)
+{
+  grn_nfkc_normalize_options_fin(ctx, (grn_nfkc_normalize_options *)data);
+  GRN_PLUGIN_FREE(ctx, data);
+}
+
+/* Init callback: builds the grn_nfkc100_token_filter state for a query.
+ * Fetches the options for this token filter of the query's lexicon through
+ * grn_table_cache_token_filter_options(), then allocates the state and
+ * initializes its embedded string and token.
+ * Returns NULL when ctx->rc is not GRN_SUCCESS after the options lookup or
+ * when the state cannot be allocated (GRN_NO_MEMORY_AVAILABLE is reported). */
+static void *
+nfkc100_init(grn_ctx *ctx, grn_tokenizer_query *query)
+{
+  grn_obj *table = grn_tokenizer_query_get_lexicon(ctx, query);
+  unsigned int nth = grn_tokenizer_query_get_token_filter_index(ctx, query);
+  grn_nfkc_normalize_options *cached_options;
+  grn_nfkc100_token_filter *state;
+
+  cached_options =
+    grn_table_cache_token_filter_options(ctx, table, nth,
+                                         nfkc100_open_options,
+                                         nfkc100_close_options,
+                                         NULL);
+  if (ctx->rc != GRN_SUCCESS) {
+    return NULL;
+  }
+
+  state = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_nfkc100_token_filter));
+  if (state == NULL) {
+    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                     "[token-filter][nfkc100] "
+                     "failed to allocate grn_nfkc100_token_filter");
+    return NULL;
+  }
+
+  state->options = cached_options;
+  grn_string_init(ctx, (grn_obj *)&state->string,
+                  grn_ctx_get(ctx, "NormalizerNFKC100", -1),
+                  0, GRN_CTX_GET_ENCODING(ctx));
+  grn_tokenizer_token_init(ctx, &state->token);
+  return state;
+}
+
+/* Filter callback: normalizes the text of a token.
+ * The data of current_token is set as the original text of the string kept
+ * in the filter state, that string is normalized by grn_nfkc_normalize()
+ * with the options stored by nfkc100_init(), and the normalized text is set
+ * as the data of next_token.
+ * next_token is left untouched when user_data is NULL or when the context
+ * encoding is not GRN_ENC_UTF8. */
+static void
+nfkc100_filter(grn_ctx *ctx,
+               grn_token *current_token,
+               grn_token *next_token,
+               void *user_data)
+{
+  grn_nfkc100_token_filter *token_filter = user_data;
+  grn_obj *data;
+  grn_obj *string;
+  const char *normalized;
+  unsigned int normalized_length;
+
+  if (!token_filter) {
+    return;
+  }
+
+  /* Tokens are passed through unchanged unless the encoding is UTF-8. */
+  if (GRN_CTX_GET_ENCODING(ctx) != GRN_ENC_UTF8) {
+    return;
+  }
+
+  /* Reuse the string owned by the filter state for every token. */
+  data = grn_token_get_data(ctx, current_token);
+  string = (grn_obj *)(&(token_filter->string));
+  grn_string_set_original(ctx, string,
+                          GRN_TEXT_VALUE(data), GRN_TEXT_LEN(data));
+  grn_nfkc_normalize(ctx, string, token_filter->options);
+
+  grn_string_get_normalized(ctx, string,
+                            &normalized, &normalized_length, NULL);
+  grn_token_set_data(ctx, next_token, normalized, normalized_length);
+}
+
+/* Fin callback: finalizes the token and string embedded in the state made by
+ * nfkc100_init() and frees the state. A NULL user_data is ignored. The
+ * options pointer stored in the state is not released here. */
+static void
+nfkc100_fin(grn_ctx *ctx, void *user_data)
+{
+  grn_nfkc100_token_filter *state = user_data;
+  if (state != NULL) {
+    grn_tokenizer_token_fin(ctx, &state->token);
+    grn_string_fin(ctx, (grn_obj *)&state->string);
+    GRN_PLUGIN_FREE(ctx, state);
+  }
+}
+
+/* Registers the built-in TokenFilterNFKC100 token filter with its callbacks.
+ * Returns ctx->rc instead of GRN_SUCCESS when the filter cannot be created. */
+grn_rc
+grn_db_init_builtin_token_filters(grn_ctx *ctx)
+{
+  grn_obj *nfkc100 = grn_token_filter_create(ctx, "TokenFilterNFKC100", -1);
+  if (nfkc100) {
+    grn_token_filter_set_init_func(ctx, nfkc100, nfkc100_init);
+    grn_token_filter_set_filter_func(ctx, nfkc100, nfkc100_filter);
+    grn_token_filter_set_fin_func(ctx, nfkc100, nfkc100_fin);
+  }
+  return nfkc100 ? GRN_SUCCESS : ctx->rc;
+}

  Modified: test/command/suite/schema/plugins.expected (+4 -0)
===================================================================
--- test/command/suite/schema/plugins.expected    2018-11-01 16:08:01 +0900 (1ddc055f1)
+++ test/command/suite/schema/plugins.expected    2018-11-01 16:08:29 +0900 (9063bca59)
@@ -211,6 +211,10 @@ schema
       }
     },
     "token_filters": {
+      "TokenFilterNFKC100": {
+        "id": 214,
+        "name": "TokenFilterNFKC100"
+      }
     },
     "tables": {
     }

  Modified: test/command/suite/schema/tables/columns/compress/lz4.expected (+4 -0)
===================================================================
--- test/command/suite/schema/tables/columns/compress/lz4.expected    2018-11-01 16:08:01 +0900 (ac1adadd1)
+++ test/command/suite/schema/tables/columns/compress/lz4.expected    2018-11-01 16:08:29 +0900 (b34bb9321)
@@ -210,6 +210,10 @@ schema
       }
     },
     "token_filters": {
+      "TokenFilterNFKC100": {
+        "id": 214,
+        "name": "TokenFilterNFKC100"
+      }
     },
     "tables": {
       "Logs": {

  Modified: test/command/suite/schema/tables/columns/compress/zlib.expected (+4 -0)
===================================================================
--- test/command/suite/schema/tables/columns/compress/zlib.expected    2018-11-01 16:08:01 +0900 (28ebf9471)
+++ test/command/suite/schema/tables/columns/compress/zlib.expected    2018-11-01 16:08:29 +0900 (f77305b4d)
@@ -210,6 +210,10 @@ schema
       }
     },
     "token_filters": {
+      "TokenFilterNFKC100": {
+        "id": 214,
+        "name": "TokenFilterNFKC100"
+      }
     },
     "tables": {
       "Logs": {

  Modified: test/command/suite/schema/tables/columns/compress/zstd.expected (+4 -0)
===================================================================
--- test/command/suite/schema/tables/columns/compress/zstd.expected    2018-11-01 16:08:01 +0900 (470518c79)
+++ test/command/suite/schema/tables/columns/compress/zstd.expected    2018-11-01 16:08:29 +0900 (51c585f7c)
@@ -210,6 +210,10 @@ schema
       }
     },
     "token_filters": {
+      "TokenFilterNFKC100": {
+        "id": 214,
+        "name": "TokenFilterNFKC100"
+      }
     },
     "tables": {
       "Logs": {

  Modified: test/command/suite/schema/tables/columns/type/index_medium.expected (+4 -0)
===================================================================
--- test/command/suite/schema/tables/columns/type/index_medium.expected    2018-11-01 16:08:01 +0900 (aa9f4a989)
+++ test/command/suite/schema/tables/columns/type/index_medium.expected    2018-11-01 16:08:29 +0900 (848b60b62)
@@ -216,6 +216,10 @@ schema
       }
     },
     "token_filters": {
+      "TokenFilterNFKC100": {
+        "id": 214,
+        "name": "TokenFilterNFKC100"
+      }
     },
     "tables": {
       "Posts": {

  Modified: test/command/suite/schema/tables/columns/type/index_small.expected (+4 -0)
===================================================================
--- test/command/suite/schema/tables/columns/type/index_small.expected    2018-11-01 16:08:01 +0900 (010c2f075)
+++ test/command/suite/schema/tables/columns/type/index_small.expected    2018-11-01 16:08:29 +0900 (59ae37fae)
@@ -216,6 +216,10 @@ schema
       }
     },
     "token_filters": {
+      "TokenFilterNFKC100": {
+        "id": 214,
+        "name": "TokenFilterNFKC100"
+      }
     },
     "tables": {
       "Posts": {

  Modified: test/command/suite/schema/tables/columns/type/scalar.expected (+4 -0)
===================================================================
--- test/command/suite/schema/tables/columns/type/scalar.expected    2018-11-01 16:08:01 +0900 (91c546c87)
+++ test/command/suite/schema/tables/columns/type/scalar.expected    2018-11-01 16:08:29 +0900 (6f77a4932)
@@ -210,6 +210,10 @@ schema
       }
     },
     "token_filters": {
+      "TokenFilterNFKC100": {
+        "id": 214,
+        "name": "TokenFilterNFKC100"
+      }
     },
     "tables": {
       "Logs": {

  Modified: test/command/suite/schema/tables/columns/type/vector.expected (+4 -0)
===================================================================
--- test/command/suite/schema/tables/columns/type/vector.expected    2018-11-01 16:08:01 +0900 (1667a56ed)
+++ test/command/suite/schema/tables/columns/type/vector.expected    2018-11-01 16:08:29 +0900 (3ccfabc9c)
@@ -212,6 +212,10 @@ schema
       }
     },
     "token_filters": {
+      "TokenFilterNFKC100": {
+        "id": 214,
+        "name": "TokenFilterNFKC100"
+      }
     },
     "tables": {
       "Posts": {

  Modified: test/command/suite/schema/tables/normalizer.expected (+4 -0)
===================================================================
--- test/command/suite/schema/tables/normalizer.expected    2018-11-01 16:08:01 +0900 (89aca91fd)
+++ test/command/suite/schema/tables/normalizer.expected    2018-11-01 16:08:29 +0900 (90a36ebbf)
@@ -208,6 +208,10 @@ schema
       }
     },
     "token_filters": {
+      "TokenFilterNFKC100": {
+        "id": 214,
+        "name": "TokenFilterNFKC100"
+      }
     },
     "tables": {
       "Tags": {

  Modified: test/command/suite/schema/tables/normalizer_with_options.expected (+4 -0)
===================================================================
--- test/command/suite/schema/tables/normalizer_with_options.expected    2018-11-01 16:08:01 +0900 (9815a7bb5)
+++ test/command/suite/schema/tables/normalizer_with_options.expected    2018-11-01 16:08:29 +0900 (e3425678e)
@@ -208,6 +208,10 @@ schema
       }
     },
     "token_filters": {
+      "TokenFilterNFKC100": {
+        "id": 214,
+        "name": "TokenFilterNFKC100"
+      }
     },
     "tables": {
       "Tags": {

  Modified: test/command/suite/schema/tables/token_filters.expected (+4 -0)
===================================================================
--- test/command/suite/schema/tables/token_filters.expected    2018-11-01 16:08:01 +0900 (19d2e1484)
+++ test/command/suite/schema/tables/token_filters.expected    2018-11-01 16:08:29 +0900 (a5a6555f2)
@@ -213,6 +213,10 @@ schema
       }
     },
     "token_filters": {
+      "TokenFilterNFKC100": {
+        "id": 214,
+        "name": "TokenFilterNFKC100"
+      },
       "TokenFilterStopWord": {
         "id": 256,
         "name": "TokenFilterStopWord"

  Modified: test/command/suite/schema/tables/token_filters_with_options.expected (+4 -0)
===================================================================
--- test/command/suite/schema/tables/token_filters_with_options.expected    2018-11-01 16:08:01 +0900 (d70d95796)
+++ test/command/suite/schema/tables/token_filters_with_options.expected    2018-11-01 16:08:29 +0900 (912c430a1)
@@ -213,6 +213,10 @@ schema
       }
     },
     "token_filters": {
+      "TokenFilterNFKC100": {
+        "id": 214,
+        "name": "TokenFilterNFKC100"
+      },
       "TokenFilterStopWord": {
         "id": 256,
         "name": "TokenFilterStopWord"

  Modified: test/command/suite/schema/tables/tokenizer.expected (+4 -0)
===================================================================
--- test/command/suite/schema/tables/tokenizer.expected    2018-11-01 16:08:01 +0900 (026790528)
+++ test/command/suite/schema/tables/tokenizer.expected    2018-11-01 16:08:29 +0900 (7ab8fdb57)
@@ -208,6 +208,10 @@ schema
       }
     },
     "token_filters": {
+      "TokenFilterNFKC100": {
+        "id": 214,
+        "name": "TokenFilterNFKC100"
+      }
     },
     "tables": {
       "Terms": {

  Modified: test/command/suite/schema/tables/tokenizer_with_options.expected (+4 -0)
===================================================================
--- test/command/suite/schema/tables/tokenizer_with_options.expected    2018-11-01 16:08:01 +0900 (c7ed034f7)
+++ test/command/suite/schema/tables/tokenizer_with_options.expected    2018-11-01 16:08:29 +0900 (a1317090d)
@@ -208,6 +208,10 @@ schema
       }
     },
     "token_filters": {
+      "TokenFilterNFKC100": {
+        "id": 214,
+        "name": "TokenFilterNFKC100"
+      }
     },
     "tables": {
       "Terms": {

  Modified: test/command/suite/schema/tables/type/array.expected (+4 -0)
===================================================================
--- test/command/suite/schema/tables/type/array.expected    2018-11-01 16:08:01 +0900 (94661bca3)
+++ test/command/suite/schema/tables/type/array.expected    2018-11-01 16:08:29 +0900 (abba3c010)
@@ -208,6 +208,10 @@ schema
       }
     },
     "token_filters": {
+      "TokenFilterNFKC100": {
+        "id": 214,
+        "name": "TokenFilterNFKC100"
+      }
     },
     "tables": {
       "Logs": {

  Modified: test/command/suite/schema/tables/type/hash_table.expected (+4 -0)
===================================================================
--- test/command/suite/schema/tables/type/hash_table.expected    2018-11-01 16:08:01 +0900 (a540989b7)
+++ test/command/suite/schema/tables/type/hash_table.expected    2018-11-01 16:08:29 +0900 (3a16a8f9c)
@@ -208,6 +208,10 @@ schema
       }
     },
     "token_filters": {
+      "TokenFilterNFKC100": {
+        "id": 214,
+        "name": "TokenFilterNFKC100"
+      }
     },
     "tables": {
       "Users": {

  Modified: test/command/suite/schema/tables/value_type/reference.expected (+4 -0)
===================================================================
--- test/command/suite/schema/tables/value_type/reference.expected    2018-11-01 16:08:01 +0900 (a566e438f)
+++ test/command/suite/schema/tables/value_type/reference.expected    2018-11-01 16:08:29 +0900 (da158d59c)
@@ -210,6 +210,10 @@ schema
       }
     },
     "token_filters": {
+      "TokenFilterNFKC100": {
+        "id": 214,
+        "name": "TokenFilterNFKC100"
+      }
     },
     "tables": {
       "Logs": {

  Modified: test/command/suite/schema/tables/value_type/type.expected (+4 -0)
===================================================================
--- test/command/suite/schema/tables/value_type/type.expected    2018-11-01 16:08:01 +0900 (b3a0afd59)
+++ test/command/suite/schema/tables/value_type/type.expected    2018-11-01 16:08:29 +0900 (b5b0877bf)
@@ -208,6 +208,10 @@ schema
       }
     },
     "token_filters": {
+      "TokenFilterNFKC100": {
+        "id": 214,
+        "name": "TokenFilterNFKC100"
+      }
     },
     "tables": {
       "Logs": {

  Added: test/command/suite/token_filters/nfkc100/unify_kana.expected (+25 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/token_filters/nfkc100/unify_kana.expected    2018-11-01 16:08:29 +0900 (796a9f213)
@@ -0,0 +1,25 @@
+tokenize TokenDelimit "リンゴ りんご 林檎"   --token_filters 'TokenFilterNFKC100("unify_kana", true)'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "りんご",
+      "position": 0,
+      "force_prefix": false
+    },
+    {
+      "value": "りんご",
+      "position": 1,
+      "force_prefix": false
+    },
+    {
+      "value": "林檎",
+      "position": 2,
+      "force_prefix": false
+    }
+  ]
+]

  Added: test/command/suite/token_filters/nfkc100/unify_kana.test (+2 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/token_filters/nfkc100/unify_kana.test    2018-11-01 16:08:29 +0900 (c2634c39c)
@@ -0,0 +1,2 @@
+tokenize TokenDelimit "リンゴ りんご 林檎" \
+  --token_filters 'TokenFilterNFKC100("unify_kana", true)'
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20181101/3332f7cb/attachment-0001.html>


More information about the Groonga-commit mailing list
Back to archive index