Add entity name length truncation with configurable limit
This commit is contained in:
parent
20edd32950
commit
904b1f46f9
2 changed files with 46 additions and 4 deletions
|
|
@ -13,6 +13,7 @@ DEFAULT_MAX_GRAPH_NODES = 1000
|
||||||
# Default values for extraction settings
|
# Default values for extraction settings
|
||||||
DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing
|
DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing
|
||||||
DEFAULT_MAX_GLEANING = 1
|
DEFAULT_MAX_GLEANING = 1
|
||||||
|
# Maximum number of characters kept for an entity name (and for relationship
# source/target names); longer identifiers are truncated during extraction.
DEFAULT_ENTITY_NAME_MAX_LENGTH = 256
|
||||||
|
|
||||||
# Number of description fragments to trigger LLM summary
|
# Number of description fragments to trigger LLM summary
|
||||||
DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8
|
DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8
|
||||||
|
|
|
||||||
|
|
@ -58,6 +58,7 @@ from lightrag.constants import (
|
||||||
SOURCE_IDS_LIMIT_METHOD_FIFO,
|
SOURCE_IDS_LIMIT_METHOD_FIFO,
|
||||||
DEFAULT_FILE_PATH_MORE_PLACEHOLDER,
|
DEFAULT_FILE_PATH_MORE_PLACEHOLDER,
|
||||||
DEFAULT_MAX_FILE_PATHS,
|
DEFAULT_MAX_FILE_PATHS,
|
||||||
|
DEFAULT_ENTITY_NAME_MAX_LENGTH,
|
||||||
)
|
)
|
||||||
from lightrag.kg.shared_storage import get_storage_keyed_lock
|
from lightrag.kg.shared_storage import get_storage_keyed_lock
|
||||||
import time
|
import time
|
||||||
|
|
@ -69,6 +70,27 @@ from dotenv import load_dotenv
|
||||||
load_dotenv(dotenv_path=".env", override=False)
|
load_dotenv(dotenv_path=".env", override=False)
|
||||||
|
|
||||||
|
|
||||||
|
def _truncate_entity_identifier(
    identifier: str, limit: int, chunk_key: str, identifier_role: str
) -> str:
    """Truncate an entity identifier that exceeds the configured length limit.

    Args:
        identifier: Entity name (or relationship endpoint name) to check.
        limit: Maximum number of characters to keep.
        chunk_key: Identifier of the chunk being processed; used as the
            prefix of the warning message.
        identifier_role: Human-readable label for the identifier (for
            example ``"Entity name"``); included in the warning message.

    Returns:
        ``identifier`` unchanged when it is at most ``limit`` characters
        long, otherwise its first ``limit`` characters. A warning is logged
        whenever truncation takes place.
    """
    if len(identifier) <= limit:
        return identifier

    # Log only a short preview so an oversized identifier does not flood the log.
    preview = identifier[:20]
    logger.warning(
        "%s: %s exceeded %d characters (len: %d, preview: '%s...')",
        chunk_key,
        identifier_role,
        limit,
        len(identifier),
        preview,
    )
    return identifier[:limit]
|
||||||
|
|
||||||
|
|
||||||
def chunking_by_token_size(
|
def chunking_by_token_size(
|
||||||
tokenizer: Tokenizer,
|
tokenizer: Tokenizer,
|
||||||
content: str,
|
content: str,
|
||||||
|
|
@ -952,7 +974,14 @@ async def _process_extraction_result(
|
||||||
record_attributes, chunk_key, timestamp, file_path
|
record_attributes, chunk_key, timestamp, file_path
|
||||||
)
|
)
|
||||||
if entity_data is not None:
|
if entity_data is not None:
|
||||||
maybe_nodes[entity_data["entity_name"]].append(entity_data)
|
truncated_name = _truncate_entity_identifier(
|
||||||
|
entity_data["entity_name"],
|
||||||
|
DEFAULT_ENTITY_NAME_MAX_LENGTH,
|
||||||
|
chunk_key,
|
||||||
|
"Entity name",
|
||||||
|
)
|
||||||
|
entity_data["entity_name"] = truncated_name
|
||||||
|
maybe_nodes[truncated_name].append(entity_data)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Try to parse as relationship
|
# Try to parse as relationship
|
||||||
|
|
@ -960,9 +989,21 @@ async def _process_extraction_result(
|
||||||
record_attributes, chunk_key, timestamp, file_path
|
record_attributes, chunk_key, timestamp, file_path
|
||||||
)
|
)
|
||||||
if relationship_data is not None:
|
if relationship_data is not None:
|
||||||
maybe_edges[
|
truncated_source = _truncate_entity_identifier(
|
||||||
(relationship_data["src_id"], relationship_data["tgt_id"])
|
relationship_data["src_id"],
|
||||||
].append(relationship_data)
|
DEFAULT_ENTITY_NAME_MAX_LENGTH,
|
||||||
|
chunk_key,
|
||||||
|
"Relationship source entity",
|
||||||
|
)
|
||||||
|
truncated_target = _truncate_entity_identifier(
|
||||||
|
relationship_data["tgt_id"],
|
||||||
|
DEFAULT_ENTITY_NAME_MAX_LENGTH,
|
||||||
|
chunk_key,
|
||||||
|
"Relationship target entity",
|
||||||
|
)
|
||||||
|
relationship_data["src_id"] = truncated_source
|
||||||
|
relationship_data["tgt_id"] = truncated_target
|
||||||
|
maybe_edges[(truncated_source, truncated_target)].append(relationship_data)
|
||||||
|
|
||||||
return dict(maybe_nodes), dict(maybe_edges)
|
return dict(maybe_nodes), dict(maybe_edges)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue