Fix: Correct pagination and early termination bugs in chunk_list()

- Fix pagination offset calculation (page_num * bs instead of p) - Fix early loop termination (break only when zero chunks, not < bs) - Add max_count parameter to GraphRAG chunk loading These bugs prevented processing large documents (>128 chunks) in GraphRAG and other workflows. Validated by testing with 3,207 chunk document. Fixes #11687
2025-12-02 23:59:36 -06:00 · 2025-12-02 23:59:36 -06:00 · a3f871a144
commit a3f871a144
parent e0e1d04da5
2 changed files with 13 additions and 6 deletions
--- a/graphrag/general/index.py
+++ b/graphrag/general/index.py
@ -57,7 +57,7 @@ async def run_graphrag(
    start = trio.current_time()
    tenant_id, kb_id, doc_id = row["tenant_id"], str(row["kb_id"]), row["doc_id"]
    chunks = []
-    for d in settings.retriever.chunk_list(doc_id, tenant_id, [kb_id], fields=["content_with_weight", "doc_id"], sort_by_position=True):
+    for d in settings.retriever.chunk_list(doc_id, tenant_id, [kb_id], max_count=10000, fields=["content_with_weight", "doc_id"], sort_by_position=True):
        chunks.append(d["content_with_weight"])

    with trio.fail_after(max(120, len(chunks) * 60 * 10) if enable_timeout_assertion else 10000000000):
@ -174,13 +174,19 @@ async def run_graphrag_for_kb(
        chunks = []
        current_chunk = ""

-        for d in settings.retriever.chunk_list(
+        # DEBUG: Obtener todos los chunks primero
+        raw_chunks = list(settings.retriever.chunk_list(
            doc_id,
            tenant_id,
            [kb_id],
+            max_count=10000,  # FIX: Aumentar límite para procesar todos los chunks
            fields=fields_for_chunks,
            sort_by_position=True,
-        ):
+        ))
+
+        callback(msg=f"[DEBUG] chunk_list() returned {len(raw_chunks)} raw chunks for doc {doc_id}")
+
+        for d in raw_chunks:
            content = d["content_with_weight"]
            if num_tokens_from_string(current_chunk + content) < 1024:
                current_chunk += content
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@ -527,15 +527,16 @@ class Dealer:

        res = []
        bs = 128
-        for p in range(offset, max_count, bs):
-            es_res = self.dataStore.search(fields, [], condition, [], orderBy, p, bs, index_name(tenant_id),
+        for page_num, p in enumerate(range(offset, max_count, bs)):
+            es_res = self.dataStore.search(fields, [], condition, [], orderBy, page_num * bs, bs, index_name(tenant_id),
                                           kb_ids)
            dict_chunks = self.dataStore.get_fields(es_res, fields)
            for id, doc in dict_chunks.items():
                doc["id"] = id
            if dict_chunks:
                res.extend(dict_chunks.values())
-            if len(dict_chunks.values()) < bs:
+            # FIX: Solo terminar si no hay chunks, no si hay menos de bs
+            if len(dict_chunks.values()) == 0:
                break
        return res