Merge branch 'separator_file_path'

2025-07-26 10:39:03 +08:00 · 2025-07-26 10:39:03 +08:00 · 8e7014d366
commit 8e7014d366
parent 2ed046171e a943265257
2 changed files with 41 additions and 12 deletions
--- a/lightrag/kg/milvus_impl.py
+++ b/lightrag/kg/milvus_impl.py
@ -47,7 +47,7 @@ class MilvusVectorDBStorage(BaseVectorStorage):
                FieldSchema(
                    name="file_path",
                    dtype=DataType.VARCHAR,
-                    max_length=1024,
+                    max_length=4090,
                    nullable=True,
                ),
            ]
@ -64,7 +64,7 @@ class MilvusVectorDBStorage(BaseVectorStorage):
                FieldSchema(
                    name="file_path",
                    dtype=DataType.VARCHAR,
-                    max_length=1024,
+                    max_length=4090,
                    nullable=True,
                ),
            ]
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@ -968,16 +968,7 @@ async def _merge_nodes_then_upsert(
    source_id = GRAPH_FIELD_SEP.join(
        set([dp["source_id"] for dp in nodes_data] + already_source_ids)
    )
-    file_path = GRAPH_FIELD_SEP.join(
+    file_path = build_file_path(already_file_paths, nodes_data, entity_name)
        set(
            [
                dp.get("file_path", "unknown_source")
                for dp in nodes_data
                if dp.get("file_path")
            ]
            + [fp for fp in already_file_paths if fp]
        )
    )
    force_llm_summary_on_merge = global_config["force_llm_summary_on_merge"]
@ -3141,6 +3132,44 @@ async def kg_query_with_keywords(
    return response
 def build_file_path(already_file_paths, data_list, target, max_length=4090):
    """Merge existing and newly seen file paths into one separator-joined string.

    Non-empty entries of ``already_file_paths`` are always kept, in their
    original order. Each ``file_path`` found in ``data_list`` is appended
    after them unless it is empty, already present, or would push the joined
    string to ``max_length`` or beyond; paths dropped for size are reported
    in a single warning.

    Args:
        already_file_paths: Iterable of file path strings already stored.
        data_list: Iterable of dicts; the ``"file_path"`` key of each is read
            with ``dict.get``.
        target: Name of the item being merged; only used in the warning text.
        max_length: Size budget for the joined string, measured in UTF-8
            bytes. The default matches the 4090 used for the ``file_path``
            VARCHAR field elsewhere in this change.

    Returns:
        The kept paths joined with ``GRAPH_FIELD_SEP``.
    """

    def _utf8_len(text):
        # Measure bytes, not characters: a multi-byte path (e.g. CJK file
        # names) is larger on disk than len(str) suggests. "surrogatepass"
        # keeps paths containing lone surrogates from raising here.
        return len(text.encode("utf-8", errors="surrogatepass"))

    # Existing paths: drop empty values, keep their order.
    kept = [fp for fp in already_file_paths if fp]
    # Set used only for O(1) duplicate detection of incoming paths.
    seen = set(kept)

    sep_size = _utf8_len(GRAPH_FIELD_SEP)
    used = _utf8_len(GRAPH_FIELD_SEP.join(kept))
    ignored = []

    for dp in data_list:
        cur_file_path = dp.get("file_path")
        # Skip empty values and anything already recorded.
        if not cur_file_path or cur_file_path in seen:
            continue
        seen.add(cur_file_path)

        # The separator is always counted, even for the very first entry,
        # which keeps the check slightly conservative.
        cost = sep_size + _utf8_len(cur_file_path)
        if used + cost < max_length:
            # Only a non-first entry actually adds a separator to the result.
            used += cost if kept else cost - sep_size
            kept.append(cur_file_path)
        else:
            ignored.append(cur_file_path)

    if ignored:
        logger.warning(
            f"Length of file_path for {target} exceeds {max_length}, "
            f"ignoring new file: {GRAPH_FIELD_SEP.join(ignored)}"
        )
    return GRAPH_FIELD_SEP.join(kept)
 # TODO: Deprecated, use user_prompt in QueryParam instead
 async def query_with_keywords(
    query: str,