def build_file_path(already_file_paths, data_list, target, max_length=4090):
    """Merge stored and newly seen file paths into one separator-joined string.

    Paths from ``already_file_paths`` come first, in their original order,
    followed by the ``file_path`` values found in ``data_list``.  Empty
    values and duplicates are dropped.  A new path is appended only while
    the UTF-8 encoded result stays strictly below ``max_length`` bytes;
    paths that do not fit are skipped and reported in a single warning.

    Args:
        already_file_paths: Iterable of path strings that are already stored
            for the target.  They are always retained (after de-duplication),
            even if they alone exceed ``max_length``.
        data_list: Iterable of mappings; each may carry a ``"file_path"`` key.
        target: Name of the entity/relation being merged; used only in the
            warning message.
        max_length: Byte budget for the joined string.  The default of 4090
            is presumably meant to mirror the ``file_path`` VARCHAR
            ``max_length`` of the vector storage schema -- confirm against
            the storage backend in use.

    Returns:
        The retained paths joined with ``GRAPH_FIELD_SEP`` (an empty string
        when there is nothing to keep).
    """

    def _byte_len(text):
        # The storage limit is expressed in bytes, not characters, so size
        # the UTF-8 encoding.  ``surrogatepass`` keeps sizing from raising on
        # paths that carry lone surrogates.
        return len(text.encode("utf-8", errors="surrogatepass"))

    # dict.fromkeys de-duplicates while preserving first-seen order.
    kept = list(dict.fromkeys(fp for fp in already_file_paths if fp))
    seen = set(kept)

    sep_size = _byte_len(GRAPH_FIELD_SEP)
    used = sum(map(_byte_len, kept)) + sep_size * max(len(kept) - 1, 0)

    ignored = []
    for dp in data_list:
        cur_file_path = dp.get("file_path")
        if not cur_file_path or cur_file_path in seen:
            continue
        # Remember skipped paths too, so a repeated oversized path is
        # reported only once.
        seen.add(cur_file_path)

        # A separator is only needed when something precedes this path.
        needed = _byte_len(cur_file_path) + (sep_size if kept else 0)
        if used + needed < max_length:
            kept.append(cur_file_path)
            used += needed
        else:
            ignored.append(cur_file_path)

    if ignored:
        logger.warning(
            "Length of file_path for %s exceeds %d, ignoring new file: %s",
            target,
            max_length,
            GRAPH_FIELD_SEP.join(ignored),
        )
    return GRAPH_FIELD_SEP.join(kept)