From a3f871a144246bc05e7dd7dd9d4d42ad44551f02 Mon Sep 17 00:00:00 2001 From: dnosferatus Date: Tue, 2 Dec 2025 23:59:36 -0600 Subject: [PATCH] Fix: Correct pagination and early termination bugs in chunk_list() - Fix pagination offset calculation (page_num * bs instead of p) - Fix early loop termination (break only when zero chunks, not < bs) - Add max_count parameter to GraphRAG chunk loading These bugs prevented processing large documents (>128 chunks) in GraphRAG and other workflows. Validated by testing with a 3,207-chunk document. Fixes #11687 --- graphrag/general/index.py | 12 +++++++++--- rag/nlp/search.py | 7 ++++--- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/graphrag/general/index.py b/graphrag/general/index.py index 12b39400e..f307e5d91 100644 --- a/graphrag/general/index.py +++ b/graphrag/general/index.py @@ -57,7 +57,7 @@ async def run_graphrag( start = trio.current_time() tenant_id, kb_id, doc_id = row["tenant_id"], str(row["kb_id"]), row["doc_id"] chunks = [] - for d in settings.retriever.chunk_list(doc_id, tenant_id, [kb_id], fields=["content_with_weight", "doc_id"], sort_by_position=True): + for d in settings.retriever.chunk_list(doc_id, tenant_id, [kb_id], max_count=10000, fields=["content_with_weight", "doc_id"], sort_by_position=True): chunks.append(d["content_with_weight"]) with trio.fail_after(max(120, len(chunks) * 60 * 10) if enable_timeout_assertion else 10000000000): @@ -174,13 +174,19 @@ async def run_graphrag_for_kb( chunks = [] current_chunk = "" - for d in settings.retriever.chunk_list( + # DEBUG: fetch all chunks up front + raw_chunks = list(settings.retriever.chunk_list( doc_id, tenant_id, [kb_id], + max_count=10000, # FIX: raise the limit so all chunks are processed fields=fields_for_chunks, sort_by_position=True, - ): + )) + + callback(msg=f"[DEBUG] chunk_list() returned {len(raw_chunks)} raw chunks for doc {doc_id}") + + for d in raw_chunks: content = d["content_with_weight"] if 
num_tokens_from_string(current_chunk + content) < 1024: current_chunk += content diff --git a/rag/nlp/search.py b/rag/nlp/search.py index a479e5d3f..c67d2d422 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -527,15 +527,16 @@ class Dealer: res = [] bs = 128 - for p in range(offset, max_count, bs): - es_res = self.dataStore.search(fields, [], condition, [], orderBy, p, bs, index_name(tenant_id), + for page_num, p in enumerate(range(offset, max_count, bs)): + es_res = self.dataStore.search(fields, [], condition, [], orderBy, page_num * bs, bs, index_name(tenant_id), kb_ids) dict_chunks = self.dataStore.get_fields(es_res, fields) for id, doc in dict_chunks.items(): doc["id"] = id if dict_chunks: res.extend(dict_chunks.values()) - if len(dict_chunks.values()) < bs: + # FIX: only stop when no chunks are returned, not when fewer than bs + if len(dict_chunks.values()) == 0: break return res