diff --git a/lightrag/operate.py b/lightrag/operate.py
index d25a899f..10cb251e 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -28,6 +28,7 @@ from .utils import (
     remove_think_tags,
     linear_gradient_weighted_polling,
     process_chunks_unified,
+    build_file_path,
 )
 from .base import (
     BaseGraphStorage,
@@ -43,7 +44,6 @@ from .constants import (
     DEFAULT_MAX_RELATION_TOKENS,
     DEFAULT_MAX_TOTAL_TOKENS,
     DEFAULT_RELATED_CHUNK_NUMBER,
-    DEFAULT_MAX_FILE_PATH_LENGTH,
 )
 from .kg.shared_storage import get_storage_keyed_lock
 import time
@@ -3133,47 +3133,6 @@ async def kg_query_with_keywords(
     return response
 
 
-def build_file_path(already_file_paths, data_list, target):
-    # set: deduplication
-    file_paths_set = {fp for fp in already_file_paths if fp}
-
-    # string: filter empty value and keep file order in already_file_paths
-    file_paths = GRAPH_FIELD_SEP.join(fp for fp in already_file_paths if fp)
-    # ignored file_paths
-    file_paths_ignore = ""
-    # add file_paths
-    for dp in data_list:
-        cur_file_path = dp.get("file_path")
-        # empty
-        if not cur_file_path:
-            continue
-
-        # skip duplicate item
-        if cur_file_path in file_paths_set:
-            continue
-        # add
-        file_paths_set.add(cur_file_path)
-
-        # check the length
-        if (
-            len(file_paths) + len(GRAPH_FIELD_SEP + cur_file_path)
-            < DEFAULT_MAX_FILE_PATH_LENGTH
-        ):
-            # append
-            file_paths += (
-                GRAPH_FIELD_SEP + cur_file_path if file_paths else cur_file_path
-            )
-        else:
-            # ignore
-            file_paths_ignore += GRAPH_FIELD_SEP + cur_file_path
-
-    if file_paths_ignore:
-        logger.warning(
-            f"Length of file_path exceeds {target}, ignoring new file: {file_paths_ignore}"
-        )
-    return file_paths
-
-
 # TODO: Deprecated, use user_prompt in QueryParam instead
 async def query_with_keywords(
     query: str,
diff --git a/lightrag/utils.py b/lightrag/utils.py
index bd5aeab2..657b6953 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -19,6 +19,8 @@ from lightrag.constants import (
     DEFAULT_LOG_MAX_BYTES,
     DEFAULT_LOG_BACKUP_COUNT,
     DEFAULT_LOG_FILENAME,
+    GRAPH_FIELD_SEP,
+    DEFAULT_MAX_FILE_PATH_LENGTH,
 )
@@ -1901,3 +1903,54 @@ async def process_chunks_unified(
     )
 
     return unique_chunks
+
+
+def build_file_path(already_file_paths, data_list, target):
+    """Build file path string with length limit and deduplication
+
+    Args:
+        already_file_paths: List of existing file paths
+        data_list: List of data items containing file_path
+        target: Target name for logging warnings
+
+    Returns:
+        str: Combined file paths separated by GRAPH_FIELD_SEP
+    """
+    # set: deduplication
+    file_paths_set = {fp for fp in already_file_paths if fp}
+
+    # string: filter empty value and keep file order in already_file_paths
+    file_paths = GRAPH_FIELD_SEP.join(fp for fp in already_file_paths if fp)
+    # ignored file_paths
+    file_paths_ignore = ""
+    # add file_paths
+    for dp in data_list:
+        cur_file_path = dp.get("file_path")
+        # empty
+        if not cur_file_path:
+            continue
+
+        # skip duplicate item
+        if cur_file_path in file_paths_set:
+            continue
+        # add
+        file_paths_set.add(cur_file_path)
+
+        # check the length
+        if (
+            len(file_paths) + len(GRAPH_FIELD_SEP + cur_file_path)
+            < DEFAULT_MAX_FILE_PATH_LENGTH
+        ):
+            # append
+            file_paths += (
+                GRAPH_FIELD_SEP + cur_file_path if file_paths else cur_file_path
+            )
+        else:
+            # ignore
+            file_paths_ignore += GRAPH_FIELD_SEP + cur_file_path
+
+    if file_paths_ignore:
+        logger.warning(
+            f"Length of file_path exceeds {target}, ignoring new file: {file_paths_ignore}"
+        )
+    return file_paths
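
For reviewers, a minimal usage sketch of the relocated helper, now importable from `lightrag.utils`. The input paths, the `target` label, and the `<SEP>` separator shown in the expected output are illustrative assumptions, not values taken from this PR; the real separator and length cap come from `GRAPH_FIELD_SEP` and `DEFAULT_MAX_FILE_PATH_LENGTH` in `lightrag.constants`:

```python
# Minimal usage sketch (illustrative inputs; assumes GRAPH_FIELD_SEP == "<SEP>").
from lightrag.utils import build_file_path

existing = ["docs/a.md", "", "docs/b.md"]    # empty entries are filtered out
incoming = [
    {"file_path": "docs/b.md"},              # duplicate of an existing path -> skipped
    {"file_path": "docs/c.md"},              # new path -> appended while under the length cap
    {"description": "no file_path key"},     # missing file_path -> skipped
]

# "entity 'Alice'" is a hypothetical target label used only for the warning message
merged = build_file_path(existing, incoming, target="entity 'Alice'")
print(merged)  # expected: "docs/a.md<SEP>docs/b.md<SEP>docs/c.md"
```

Paths that would push the combined string past `DEFAULT_MAX_FILE_PATH_LENGTH` are not appended; they are collected and reported once via `logger.warning`, keyed by the `target` label.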