Merge branch 'separator_file_path'

This commit is contained in:
yangdx 2025-07-26 10:39:03 +08:00
commit 8e7014d366
2 changed files with 41 additions and 12 deletions

View file

@ -47,7 +47,7 @@ class MilvusVectorDBStorage(BaseVectorStorage):
FieldSchema(
name="file_path",
dtype=DataType.VARCHAR,
max_length=1024,
max_length=4090,
nullable=True,
),
]
@ -64,7 +64,7 @@ class MilvusVectorDBStorage(BaseVectorStorage):
FieldSchema(
name="file_path",
dtype=DataType.VARCHAR,
max_length=1024,
max_length=4090,
nullable=True,
),
]

View file

@ -968,16 +968,7 @@ async def _merge_nodes_then_upsert(
source_id = GRAPH_FIELD_SEP.join(
set([dp["source_id"] for dp in nodes_data] + already_source_ids)
)
file_path = GRAPH_FIELD_SEP.join(
set(
[
dp.get("file_path", "unknown_source")
for dp in nodes_data
if dp.get("file_path")
]
+ [fp for fp in already_file_paths if fp]
)
)
file_path = build_file_path(already_file_paths, nodes_data, entity_name)
force_llm_summary_on_merge = global_config["force_llm_summary_on_merge"]
@ -3141,6 +3132,44 @@ async def kg_query_with_keywords(
return response
def build_file_path(already_file_paths, data_list, target, max_length=4090):
    """Merge stored and newly seen file paths into one separator-joined string.

    Paths already stored are kept first, in their original order, with empty
    values and repeats removed. New paths taken from ``data_list`` are then
    appended in iteration order for as long as the joined result stays
    strictly below ``max_length``; paths that do not fit are skipped and
    reported in a single warning.

    Args:
        already_file_paths: Iterable of file paths already recorded for the
            target (may be empty or ``None``).
        data_list: Iterable of dicts; the optional ``"file_path"`` key of each
            entry is considered for inclusion.
        target: Name used in the warning message to identify which
            entity/relation the paths belong to.
        max_length: Size limit for the joined string, measured in UTF-8
            bytes. The default matches the ``max_length`` declared for the
            ``file_path`` VARCHAR field in the vector storage schema.

    Returns:
        The kept file paths joined with ``GRAPH_FIELD_SEP``.
    """

    def byte_len(text):
        # VARCHAR limits are byte-based, so measure the encoded size rather
        # than the number of characters. "surrogatepass" keeps undecodable
        # filename characters from raising during measurement.
        return len(text.encode("utf-8", "surrogatepass"))

    sep_size = byte_len(GRAPH_FIELD_SEP)

    # dict.fromkeys de-duplicates while preserving first-seen order.
    kept = list(dict.fromkeys(fp for fp in (already_file_paths or []) if fp))
    seen = set(kept)
    # Existing paths are always retained, even if they are already over the
    # limit; only newly added paths are subject to the budget.
    used = sum(byte_len(fp) for fp in kept) + sep_size * max(len(kept) - 1, 0)

    ignored = []
    for dp in data_list:
        cur_file_path = dp.get("file_path")
        if not cur_file_path or cur_file_path in seen:
            continue
        seen.add(cur_file_path)

        # A separator is only needed when something precedes this path.
        cost = byte_len(cur_file_path) + (sep_size if kept else 0)
        if used + cost < max_length:
            kept.append(cur_file_path)
            used += cost
        else:
            ignored.append(cur_file_path)

    if ignored:
        logger.warning(
            f"Length of file_path for {target} exceeds {max_length} bytes, "
            f"ignoring new file: {GRAPH_FIELD_SEP.join(ignored)}"
        )
    return GRAPH_FIELD_SEP.join(kept)
# TODO: Deprecated, use user_prompt in QueryParam instead
async def query_with_keywords(
query: str,