groonga/groonga at 8a84411 [master] ii: use the previous buffer allocation algorithm again (Groonga-commit) - Groonga - fulltext search engine.

Kouhei Sutou	2017-02-28 13:33:47 +0900 (Tue, 28 Feb 2017)

  New Revision: 8a8441144be5c4284c78db2c4561ccd6c5a84449
  https://github.com/groonga/groonga/commit/8a8441144be5c4284c78db2c4561ccd6c5a84449

  Message:
    ii: use the previous buffer allocation algorithm again
    
    The algorithm was used until 7dd19103de5df2f8c0af7ac47b9149d421a0aa5d.
    
    The previous algorithm was designed for natural language. It suits
    natural language text, but it performs poorly for some
    non-natural-language text data. So we introduced the new buffer
    allocation algorithm.
    
    But the new buffer allocation algorithm increases the index size for
    natural language text. So we use the previous buffer allocation
    algorithm again, but only for natural language text. We assume that
    the target text is natural language if the lexicon has a tokenizer.

  Modified files:
    lib/ii.c

  Modified: lib/ii.c (+59 -32)
===================================================================

--- lib/ii.c    2017-02-28 13:07:37 +0900 (eca9b0c)
+++ lib/ii.c    2017-02-28 13:33:47 +0900 (2495b22)
@@ -4045,53 +4045,80 @@ buffer_new_lexicon_pat(grn_ctx *ctx,
   key_size = grn_table_get_key(ctx, ii->lexicon, id, key,
                                GRN_TABLE_MAX_KEY_SIZE);
   if (ii->lexicon->header.flags & GRN_OBJ_KEY_VAR_SIZE) {
-    int target_key_size = key_size;
-    int reduced_key_size = 0;
-
-    while (*lseg == NOT_ASSIGNED && target_key_size > 0) {
-      grn_id tid;
+    grn_obj *tokenizer = NULL;
 
+    grn_table_get_info(ctx, ii->lexicon, NULL, NULL, &tokenizer, NULL, NULL);
+    if (tokenizer) {
+      /* For natural language */
       cursor = grn_pat_cursor_open(ctx,
                                    (grn_pat *)(ii->lexicon),
-                                   key, target_key_size,
-                                   NULL, 0, 0, -1,
-                                   GRN_CURSOR_PREFIX);
-      if (!cursor) {
-        break;
-      }
-
-      if (reduced_key_size == 0) {
+                                   key,
+                                   key_size,
+                                   NULL,
+                                   0,
+                                   0,
+                                   -1,
+                                   GRN_CURSOR_ASCENDING|GRN_CURSOR_GT);
+      if (cursor) {
+        grn_id tid;
         while (ctx->rc == GRN_SUCCESS &&
                *lseg == NOT_ASSIGNED &&
                (tid = grn_pat_cursor_next(ctx, cursor))) {
           buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg);
         }
-      } else {
-        while (ctx->rc == GRN_SUCCESS &&
-               *lseg == NOT_ASSIGNED &&
-               (tid = grn_pat_cursor_next(ctx, cursor))) {
-          void *current_key;
-          int current_key_size;
+        grn_pat_cursor_close(ctx, cursor);
+      }
+    } else {
+      /* For text data */
+      int target_key_size = key_size;
+      int reduced_key_size = 0;
 
-          current_key_size = grn_pat_cursor_get_key(ctx, cursor, &current_key);
-          if (memcmp(((char *)current_key) + target_key_size,
-                     key + target_key_size,
-                     reduced_key_size) == 0) {
-            continue;
+      while (*lseg == NOT_ASSIGNED && target_key_size > 0) {
+        grn_id tid;
+
+        cursor = grn_pat_cursor_open(ctx,
+                                     (grn_pat *)(ii->lexicon),
+                                     key, target_key_size,
+                                     NULL, 0, 0, -1,
+                                     GRN_CURSOR_PREFIX);
+        if (!cursor) {
+          break;
+        }
+
+        if (reduced_key_size == 0) {
+          while (ctx->rc == GRN_SUCCESS &&
+                 *lseg == NOT_ASSIGNED &&
+                 (tid = grn_pat_cursor_next(ctx, cursor))) {
+            buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg);
+          }
+        } else {
+          while (ctx->rc == GRN_SUCCESS &&
+                 *lseg == NOT_ASSIGNED &&
+                 (tid = grn_pat_cursor_next(ctx, cursor))) {
+            void *current_key;
+            int current_key_size;
+
+            current_key_size = grn_pat_cursor_get_key(ctx, cursor, &current_key);
+            if (memcmp(((char *)current_key) + target_key_size,
+                       key + target_key_size,
+                       reduced_key_size) == 0) {
+              continue;
+            }
+            buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg);
           }
-          buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg);
         }
-      }
-      grn_pat_cursor_close(ctx, cursor);
+        grn_pat_cursor_close(ctx, cursor);
 
-      if (reduced_key_size == 0) {
-        reduced_key_size = 1;
-      } else {
-        reduced_key_size *= 2;
+        if (reduced_key_size == 0) {
+          reduced_key_size = 1;
+        } else {
+          reduced_key_size *= 2;
+        }
+        target_key_size -= reduced_key_size;
       }
-      target_key_size -= reduced_key_size;
     }
   } else {
+    /* For other data */
     cursor = grn_pat_cursor_open(ctx,
                                  (grn_pat *)(ii->lexicon),
                                  NULL, 0, key, key_size, 0, -1,
-------------- next part --------------
An HTML attachment was scrubbed...
下载 


Groonga - fulltext search engine.

[Groonga-commit] groonga/groonga at 8a84411 [master] ii: use the previous buffer allocation algorithm again