From 55e2678a1e2aa5b4c4c3493476b20ce1a2eeb734 Mon Sep 17 00:00:00 2001 From: xuewei <728857235@qq.com> Date: Sat, 26 Jul 2025 00:22:25 +0800 Subject: [PATCH 1/5] Improve file_path FieldSchema 4090 --- lightrag/kg/milvus_impl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightrag/kg/milvus_impl.py b/lightrag/kg/milvus_impl.py index ef73a206..a6e1b66e 100644 --- a/lightrag/kg/milvus_impl.py +++ b/lightrag/kg/milvus_impl.py @@ -47,7 +47,7 @@ class MilvusVectorDBStorage(BaseVectorStorage): FieldSchema( name="file_path", dtype=DataType.VARCHAR, - max_length=1024, + max_length=4090, nullable=True, ), ] @@ -64,7 +64,7 @@ class MilvusVectorDBStorage(BaseVectorStorage): FieldSchema( name="file_path", dtype=DataType.VARCHAR, - max_length=1024, + max_length=4090, nullable=True, ), ] From b4da3de7d95606510543127b7863a53a951c0e39 Mon Sep 17 00:00:00 2001 From: xuewei <728857235@qq.com> Date: Sat, 26 Jul 2025 00:46:02 +0800 Subject: [PATCH 2/5] Improve file_path drop policy --- lightrag/operate.py | 51 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 10 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index a3075210..0c56018f 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -968,16 +968,7 @@ async def _merge_nodes_then_upsert( source_id = GRAPH_FIELD_SEP.join( set([dp["source_id"] for dp in nodes_data] + already_source_ids) ) - file_path = GRAPH_FIELD_SEP.join( - set( - [ - dp.get("file_path", "unknown_source") - for dp in nodes_data - if dp.get("file_path") - ] - + [fp for fp in already_file_paths if fp] - ) - ) + file_path = build_file_path(already_file_paths, nodes_data, entity_name) force_llm_summary_on_merge = global_config["force_llm_summary_on_merge"] @@ -3141,6 +3132,46 @@ async def kg_query_with_keywords( return response +def build_file_path(already_file_paths, data_list, target): + # set: deduplication + file_paths_set = {fp for fp in already_file_paths if fp} + + # string: deduplication sorted + file_paths = GRAPH_FIELD_SEP.join( + list(dict.fromkeys(fp for fp in already_file_paths if fp)) + ) + # ignored file_paths + file_paths_ignore = "" + # add file_paths + for dp in data_list: + cur_file_path = dp.get("file_path") + # empty + if not cur_file_path: + continue + + # skip duplicate item + if cur_file_path in file_paths_set: + continue + # add + file_paths_set.add(cur_file_path) + + # check the length + if len(file_paths) + len(GRAPH_FIELD_SEP + cur_file_path) < 4090: + # append + file_paths += ( + GRAPH_FIELD_SEP + cur_file_path if file_paths else cur_file_path + ) + else: + # ignore + file_paths_ignore += GRAPH_FIELD_SEP + cur_file_path + + if file_paths_ignore: + print( + f"length of varchar field file_path exceeds max length target={target} ignore={file_paths_ignore}" + ) + return file_paths + + # TODO: Deprecated, use user_prompt in QueryParam instead async def query_with_keywords( query: str, From 56c3cb2dbe9c4f7774c3cf637f2801f3488f2c30 Mon Sep 17 00:00:00 2001 From: xuewei <728857235@qq.com> Date: Sat, 26 Jul 2025 08:38:02 +0800 Subject: [PATCH 3/5] Improve build_file_path log --- lightrag/operate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 0c56018f..52e37099 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -3166,7 +3166,7 @@ def build_file_path(already_file_paths, data_list, target): file_paths_ignore += GRAPH_FIELD_SEP + cur_file_path if file_paths_ignore: - print( + logger.debug( f"length of varchar field file_path exceeds max length target={target} ignore={file_paths_ignore}" ) return file_paths From 6efa8ab263047ab7ccd7b8aba5aa95b3102c7a2f Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 26 Jul 2025 10:00:18 +0800 Subject: [PATCH 4/5] Improve file path length warning message clarity and urgency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Change debug to warning level • Simplify message wording --- lightrag/operate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 52e37099..94d6b07c 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -3166,8 +3166,8 @@ def build_file_path(already_file_paths, data_list, target): file_paths_ignore += GRAPH_FIELD_SEP + cur_file_path if file_paths_ignore: - logger.debug( - f"length of varchar field file_path exceeds max length target={target} ignore={file_paths_ignore}" + logger.warning( + f"Length of file_path exceeds {target}, ignoring new file: {file_paths_ignore}" ) return file_paths From a9432652573f63077ceb1abbe8d901f765bac6ac Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 26 Jul 2025 10:21:32 +0800 Subject: [PATCH 5/5] fix: preserve file path order in build_file_path function --- lightrag/operate.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 94d6b07c..78e11c22 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -3136,10 +3136,8 @@ def build_file_path(already_file_paths, data_list, target): # set: deduplication file_paths_set = {fp for fp in already_file_paths if fp} - # string: deduplication sorted - file_paths = GRAPH_FIELD_SEP.join( - list(dict.fromkeys(fp for fp in already_file_paths if fp)) - ) + # string: filter empty value and keep file order in already_file_paths + file_paths = GRAPH_FIELD_SEP.join(fp for fp in already_file_paths if fp) # ignored file_paths file_paths_ignore = "" # add file_paths