Fix: Correct pagination and early termination bugs in chunk_list()

- Fix pagination offset calculation (use page_num * bs as the search offset instead of the raw range value p)
- Fix early loop termination (break only when a page returns zero chunks, not when it returns fewer than bs)
- Add max_count parameter to GraphRAG chunk loading

These bugs prevented processing large documents (more than 128 chunks) in GraphRAG
and other workflows. Validated by testing with a 3,207-chunk document.

Fixes #11687
This commit is contained in:
dnosferatus 2025-12-02 23:59:36 -06:00
parent e0e1d04da5
commit a3f871a144
2 changed files with 13 additions and 6 deletions

View file

@ -57,7 +57,7 @@ async def run_graphrag(
start = trio.current_time()
tenant_id, kb_id, doc_id = row["tenant_id"], str(row["kb_id"]), row["doc_id"]
chunks = []
for d in settings.retriever.chunk_list(doc_id, tenant_id, [kb_id], fields=["content_with_weight", "doc_id"], sort_by_position=True):
for d in settings.retriever.chunk_list(doc_id, tenant_id, [kb_id], max_count=10000, fields=["content_with_weight", "doc_id"], sort_by_position=True):
chunks.append(d["content_with_weight"])
with trio.fail_after(max(120, len(chunks) * 60 * 10) if enable_timeout_assertion else 10000000000):
@ -174,13 +174,19 @@ async def run_graphrag_for_kb(
chunks = []
current_chunk = ""
for d in settings.retriever.chunk_list(
# DEBUG: Obtener todos los chunks primero
raw_chunks = list(settings.retriever.chunk_list(
doc_id,
tenant_id,
[kb_id],
max_count=10000, # FIX: Aumentar límite para procesar todos los chunks
fields=fields_for_chunks,
sort_by_position=True,
):
))
callback(msg=f"[DEBUG] chunk_list() returned {len(raw_chunks)} raw chunks for doc {doc_id}")
for d in raw_chunks:
content = d["content_with_weight"]
if num_tokens_from_string(current_chunk + content) < 1024:
current_chunk += content

View file

@ -527,15 +527,16 @@ class Dealer:
res = []
bs = 128
for p in range(offset, max_count, bs):
es_res = self.dataStore.search(fields, [], condition, [], orderBy, p, bs, index_name(tenant_id),
for page_num, p in enumerate(range(offset, max_count, bs)):
es_res = self.dataStore.search(fields, [], condition, [], orderBy, page_num * bs, bs, index_name(tenant_id),
kb_ids)
dict_chunks = self.dataStore.get_fields(es_res, fields)
for id, doc in dict_chunks.items():
doc["id"] = id
if dict_chunks:
res.extend(dict_chunks.values())
if len(dict_chunks.values()) < bs:
# FIX: Solo terminar si no hay chunks, no si hay menos de bs
if len(dict_chunks.values()) == 0:
break
return res