Refactor: move build_file_path function from operate.py to utils.py
This commit is contained in:
parent
c8c3545454
commit
7b915b34f6
2 changed files with 54 additions and 42 deletions
|
|
@ -28,6 +28,7 @@ from .utils import (
|
||||||
remove_think_tags,
|
remove_think_tags,
|
||||||
linear_gradient_weighted_polling,
|
linear_gradient_weighted_polling,
|
||||||
process_chunks_unified,
|
process_chunks_unified,
|
||||||
|
build_file_path,
|
||||||
)
|
)
|
||||||
from .base import (
|
from .base import (
|
||||||
BaseGraphStorage,
|
BaseGraphStorage,
|
||||||
|
|
@ -43,7 +44,6 @@ from .constants import (
|
||||||
DEFAULT_MAX_RELATION_TOKENS,
|
DEFAULT_MAX_RELATION_TOKENS,
|
||||||
DEFAULT_MAX_TOTAL_TOKENS,
|
DEFAULT_MAX_TOTAL_TOKENS,
|
||||||
DEFAULT_RELATED_CHUNK_NUMBER,
|
DEFAULT_RELATED_CHUNK_NUMBER,
|
||||||
DEFAULT_MAX_FILE_PATH_LENGTH,
|
|
||||||
)
|
)
|
||||||
from .kg.shared_storage import get_storage_keyed_lock
|
from .kg.shared_storage import get_storage_keyed_lock
|
||||||
import time
|
import time
|
||||||
|
|
@ -3133,47 +3133,6 @@ async def kg_query_with_keywords(
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
def build_file_path(already_file_paths, data_list, target):
    """Build a combined file-path string with deduplication and a length cap.

    Args:
        already_file_paths: List of existing file paths; falsy entries are skipped.
        data_list: List of dict-like items whose "file_path" values are appended.
        target: Name used in the overflow warning message.

    Returns:
        str: File paths joined by GRAPH_FIELD_SEP, kept under
            DEFAULT_MAX_FILE_PATH_LENGTH; overflowing paths are logged and dropped.
    """
    # set: deduplication
    file_paths_set = {fp for fp in already_file_paths if fp}

    # string: filter empty value and keep file order in already_file_paths
    file_paths = GRAPH_FIELD_SEP.join(fp for fp in already_file_paths if fp)
    # ignored file_paths (accumulated for the warning below)
    file_paths_ignore = ""
    # add file_paths
    for dp in data_list:
        cur_file_path = dp.get("file_path")
        # empty
        if not cur_file_path:
            continue

        # skip duplicate item
        if cur_file_path in file_paths_set:
            continue
        # add
        file_paths_set.add(cur_file_path)

        # Compute the exact text that would be appended: the separator is only
        # emitted when file_paths is non-empty, so measure that same text.
        # (The original measured GRAPH_FIELD_SEP + cur_file_path unconditionally,
        # overcounting by the separator length when file_paths was empty.)
        addition = GRAPH_FIELD_SEP + cur_file_path if file_paths else cur_file_path
        # check the length
        if len(file_paths) + len(addition) < DEFAULT_MAX_FILE_PATH_LENGTH:
            # append
            file_paths += addition
        else:
            # ignore
            file_paths_ignore += GRAPH_FIELD_SEP + cur_file_path

    if file_paths_ignore:
        logger.warning(
            f"Length of file_path exceeds {target}, ignoring new file: {file_paths_ignore}"
        )
    return file_paths
|
|
||||||
|
|
||||||
|
|
||||||
# TODO: Deprecated, use user_prompt in QueryParam instead
|
# TODO: Deprecated, use user_prompt in QueryParam instead
|
||||||
async def query_with_keywords(
|
async def query_with_keywords(
|
||||||
query: str,
|
query: str,
|
||||||
|
|
|
||||||
|
|
@ -19,6 +19,8 @@ from lightrag.constants import (
|
||||||
DEFAULT_LOG_MAX_BYTES,
|
DEFAULT_LOG_MAX_BYTES,
|
||||||
DEFAULT_LOG_BACKUP_COUNT,
|
DEFAULT_LOG_BACKUP_COUNT,
|
||||||
DEFAULT_LOG_FILENAME,
|
DEFAULT_LOG_FILENAME,
|
||||||
|
GRAPH_FIELD_SEP,
|
||||||
|
DEFAULT_MAX_FILE_PATH_LENGTH,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1901,3 +1903,54 @@ async def process_chunks_unified(
|
||||||
)
|
)
|
||||||
|
|
||||||
return unique_chunks
|
return unique_chunks
|
||||||
|
|
||||||
|
|
||||||
|
def build_file_path(already_file_paths, data_list, target):
    """Build file path string with length limit and deduplication.

    Starts from the (deduplicated, order-preserving) existing paths and appends
    each new, non-empty, not-yet-seen path from data_list as long as the
    resulting string stays under DEFAULT_MAX_FILE_PATH_LENGTH; paths that would
    overflow are dropped and reported via a single warning.

    Args:
        already_file_paths: List of existing file paths (falsy entries skipped)
        data_list: List of data items containing file_path
        target: Target name for logging warnings

    Returns:
        str: Combined file paths separated by GRAPH_FIELD_SEP
    """
    # set: deduplication
    file_paths_set = {fp for fp in already_file_paths if fp}

    # string: filter empty value and keep file order in already_file_paths
    file_paths = GRAPH_FIELD_SEP.join(fp for fp in already_file_paths if fp)
    # ignored file_paths (collected so they can be reported once at the end)
    file_paths_ignore = ""
    # add file_paths
    for dp in data_list:
        cur_file_path = dp.get("file_path")
        # empty
        if not cur_file_path:
            continue

        # skip duplicate item
        if cur_file_path in file_paths_set:
            continue
        # add
        file_paths_set.add(cur_file_path)

        # Measure the exact text that would be appended: the separator is only
        # added when file_paths is non-empty, matching the append expression
        # below. (Previously the separator was counted even for the first path,
        # which could wrongly reject a path that actually fit.)
        addition = GRAPH_FIELD_SEP + cur_file_path if file_paths else cur_file_path
        # check the length
        if len(file_paths) + len(addition) < DEFAULT_MAX_FILE_PATH_LENGTH:
            # append
            file_paths += addition
        else:
            # ignore
            file_paths_ignore += GRAPH_FIELD_SEP + cur_file_path

    if file_paths_ignore:
        logger.warning(
            f"Length of file_path exceeds {target}, ignoring new file: {file_paths_ignore}"
        )
    return file_paths
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue