From a3f871a144246bc05e7dd7dd9d4d42ad44551f02 Mon Sep 17 00:00:00 2001 From: dnosferatus Date: Tue, 2 Dec 2025 23:59:36 -0600 Subject: [PATCH] Fix: Correct pagination and early termination bugs in chunk_list() - Fix pagination offset calculation (page_num * bs instead of p) - Fix early loop termination (break only when zero chunks, not < bs) - Add max_count parameter to GraphRAG chunk loading These bugs prevented processing large documents (>128 chunks) in GraphRAG and other workflows. Validated by testing with a 3,207-chunk document. Fixes #11687 --- graphrag/general/index.py | 12 +++++++++--- rag/nlp/search.py | 7 ++++--- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/graphrag/general/index.py b/graphrag/general/index.py index 12b39400e..f307e5d91 100644 --- a/graphrag/general/index.py +++ b/graphrag/general/index.py @@ -57,7 +57,7 @@ async def run_graphrag( start = trio.current_time() tenant_id, kb_id, doc_id = row["tenant_id"], str(row["kb_id"]), row["doc_id"] chunks = [] - for d in settings.retriever.chunk_list(doc_id, tenant_id, [kb_id], fields=["content_with_weight", "doc_id"], sort_by_position=True): + for d in settings.retriever.chunk_list(doc_id, tenant_id, [kb_id], max_count=10000, fields=["content_with_weight", "doc_id"], sort_by_position=True): chunks.append(d["content_with_weight"]) with trio.fail_after(max(120, len(chunks) * 60 * 10) if enable_timeout_assertion else 10000000000): @@ -174,13 +174,19 @@ async def run_graphrag_for_kb( chunks = [] current_chunk = "" - for d in settings.retriever.chunk_list( + # DEBUG: fetch all chunks up front + raw_chunks = list(settings.retriever.chunk_list( doc_id, tenant_id, [kb_id], + max_count=10000, # FIX: raise the limit so all chunks are processed fields=fields_for_chunks, sort_by_position=True, - ): + )) + + callback(msg=f"[DEBUG] chunk_list() returned {len(raw_chunks)} raw chunks for doc {doc_id}") + + for d in raw_chunks: content = d["content_with_weight"] if 
num_tokens_from_string(current_chunk + content) < 1024: current_chunk += content diff --git a/rag/nlp/search.py b/rag/nlp/search.py index a479e5d3f..c67d2d422 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -527,15 +527,16 @@ class Dealer: res = [] bs = 128 - for p in range(offset, max_count, bs): - es_res = self.dataStore.search(fields, [], condition, [], orderBy, p, bs, index_name(tenant_id), + for page_num, p in enumerate(range(offset, max_count, bs)): + es_res = self.dataStore.search(fields, [], condition, [], orderBy, page_num * bs, bs, index_name(tenant_id), kb_ids) dict_chunks = self.dataStore.get_fields(es_res, fields) for id, doc in dict_chunks.items(): doc["id"] = id if dict_chunks: res.extend(dict_chunks.values()) - if len(dict_chunks.values()) < bs: + # FIX: only stop when no chunks are returned, not when fewer than bs + if len(dict_chunks.values()) == 0: break return res