[Groonga-commit] groonga/groonga [master] Add grn_tokenizer_tokenized_delimiter_next()

Back to archive index

Kouhei Sutou null+****@clear*****
Tue Nov 13 11:23:18 JST 2012


Kouhei Sutou	2012-11-13 11:23:18 +0900 (Tue, 13 Nov 2012)

  New Revision: bf674e7d2ddaed3758ed704ee5c6785b8bca2d60
  https://github.com/groonga/groonga/commit/bf674e7d2ddaed3758ed704ee5c6785b8bca2d60

  Log:
    Add grn_tokenizer_tokenized_delimiter_next()

  Modified files:
    include/groonga/tokenizer.h
    lib/tokenizer.c

  Modified: include/groonga/tokenizer.h (+13 -0)
===================================================================
--- include/groonga/tokenizer.h    2012-11-13 11:22:32 +0900 (5298106)
+++ include/groonga/tokenizer.h    2012-11-13 11:23:18 +0900 (3f270df)
@@ -146,6 +146,19 @@ void grn_tokenizer_token_push(grn_ctx *ctx, grn_tokenizer_token *token,
                               grn_tokenizer_status status);
 
 /*
+  grn_tokenizer_tokenized_delimiter_next() extracts the next token
+  from the string specified by `str_ptr' and `str_length' and pushes
+  it into `token'. It returns the string after the tokenized delimiter
+  that ends the token, or `NULL' when the string contains no tokenized
+  delimiter.
+ */
+const char *grn_tokenizer_tokenized_delimiter_next(grn_ctx *ctx,
+                                                   grn_tokenizer_token *token,
+                                                   const char *str_ptr,
+                                                   unsigned int str_length,
+                                                   grn_encoding encoding);
+
+/*
   grn_tokenizer_register() registers a plugin to the database which is
   associated with `ctx'. `plugin_name_ptr' and `plugin_name_length' specify the
   plugin name. Alphabetic letters ('A'-'Z' and 'a'-'z'), digits ('0'-'9') and

  Modified: lib/tokenizer.c (+38 -0)
===================================================================
--- lib/tokenizer.c    2012-11-13 11:22:32 +0900 (566d403)
+++ lib/tokenizer.c    2012-11-13 11:23:18 +0900 (7bbcf7b)
@@ -242,6 +242,44 @@ grn_tokenizer_token_push(grn_ctx *ctx, grn_tokenizer_token *token,
   grn_ctx_push(ctx, &token->status);
 }
 
+/*
+  Pushes the part of `str_ptr' before its first tokenized delimiter (or before
+  the point where scanning stops) into `token'. Returns the position just after
+  that delimiter, or `NULL' if none was found.
+ */
+const char *
+grn_tokenizer_tokenized_delimiter_next(grn_ctx *ctx,
+                                       grn_tokenizer_token *token,
+                                       const char *str_ptr,
+                                       unsigned int str_length,
+                                       grn_encoding encoding)
+{
+  size_t char_length = 0;
+  const char *current;
+  const char *end = str_ptr + str_length;
+  const char *next_start = NULL;
+  unsigned int token_length;
+  grn_tokenizer_status status;
+
+  for (current = str_ptr; current < end; current += char_length) {
+    char_length = grn_charlen_(ctx, current, end, encoding);
+    if (char_length == 0) {
+      break; /* cannot advance; `current' stays short of `end' */
+    }
+    if (grn_tokenizer_is_tokenized_delimiter(ctx, current, char_length,
+                                             encoding)) {
+      next_start = current + char_length; /* resume after the delimiter */
+      break;
+    }
+  }
+
+  token_length = (unsigned int)(current - str_ptr);
+  status = (current == end) ? GRN_TOKENIZER_LAST : GRN_TOKENIZER_CONTINUE;
+  grn_tokenizer_token_push(ctx, token, str_ptr, token_length, status);
+
+  return next_start;
+}
+
 grn_rc
 grn_tokenizer_register(grn_ctx *ctx, const char *plugin_name_ptr,
                        unsigned int plugin_name_length,
-------------- next part --------------
HTMLの添付ファイルを保管しました...
下载 



More information about the Groonga-commit mailing list
Back to archive index