From 7acca59dfb2af1301f56ea127e5da7b6f093fa86 Mon Sep 17 00:00:00 2001 From: xuewei <728857235@qq.com> Date: Sat, 19 Jul 2025 17:27:28 +0800 Subject: [PATCH 1/2] Improve query for find_text_unit --- lightrag/operate.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 4157a9e8..183798b1 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -2721,6 +2721,18 @@ async def _find_related_text_unit_from_relationships( ] all_text_units_lookup = {} + # Deduplicate and preserve order | {c_id:order} + async def build_text_units_unique(text_units): + text_units_flat = {} + for index, unit_list in enumerate(text_units): + for c_id in unit_list: + if c_id not in text_units_flat or index < text_units_flat[c_id]: + # Keep the smallest order + text_units_flat[c_id] = index + return text_units_flat + + text_units_unique_flat = build_text_units_unique(text_units) + async def fetch_chunk_data(c_id, index): if c_id not in all_text_units_lookup: chunk_data = await text_chunks_db.get_by_id(c_id) @@ -2731,11 +2743,9 @@ async def _find_related_text_unit_from_relationships( "order": index, } - tasks = [] - for index, unit_list in enumerate(text_units): - for c_id in unit_list: - tasks.append(fetch_chunk_data(c_id, index)) - + tasks = [ + fetch_chunk_data(c_id, order) for c_id, order in text_units_unique_flat.items() + ] await asyncio.gather(*tasks) if not all_text_units_lookup: From 7e3914052d4117be4a12676115460479c96570d6 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 19 Jul 2025 21:01:03 +0800 Subject: [PATCH 2/2] Optimize text chunk retrieval with batch fetching - Replace individual chunk fetches with batch get - Simplify deduplication logic - Improve error handling for missing data --- lightrag/operate.py | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 183798b1..a0418174 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -2722,31 +2722,31 @@ async def _find_related_text_unit_from_relationships( all_text_units_lookup = {} # Deduplicate and preserve order | {c_id:order} - async def build_text_units_unique(text_units): - text_units_flat = {} - for index, unit_list in enumerate(text_units): - for c_id in unit_list: - if c_id not in text_units_flat or index < text_units_flat[c_id]: - # Keep the smallest order - text_units_flat[c_id] = index - return text_units_flat + text_units_unique_flat = {} + for index, unit_list in enumerate(text_units): + for c_id in unit_list: + if ( + c_id not in text_units_unique_flat + or index < text_units_unique_flat[c_id] + ): + # Keep the smallest order + text_units_unique_flat[c_id] = index - text_units_unique_flat = build_text_units_unique(text_units) + if not text_units_unique_flat: + logger.warning("No valid text chunks found") + return [] - async def fetch_chunk_data(c_id, index): - if c_id not in all_text_units_lookup: - chunk_data = await text_chunks_db.get_by_id(c_id) - # Only store valid data - if chunk_data is not None and "content" in chunk_data: - all_text_units_lookup[c_id] = { - "data": chunk_data, - "order": index, - } + # Batch get all text chunk data + chunk_ids = list(text_units_unique_flat.keys()) + chunk_data_list = await text_chunks_db.get_by_ids(chunk_ids) - tasks = [ - fetch_chunk_data(c_id, order) for c_id, order in text_units_unique_flat.items() - ] - await asyncio.gather(*tasks) + # Build lookup table, handling possible missing data + for chunk_id, chunk_data in zip(chunk_ids, chunk_data_list): + if chunk_data is not None and "content" in chunk_data: + all_text_units_lookup[chunk_id] = { + "data": chunk_data, + "order": text_units_unique_flat[chunk_id], + } if not all_text_units_lookup: logger.warning("No valid text chunks found")