Merge pull request #2245 from danielaskdd/entity-name-len

Refactor: Add Entity Identifier Length Truncation to Prevent Storage Failures

commit cf2174b9d7
3 changed files with 47 additions and 5 deletions
@@ -138,7 +138,7 @@ SUMMARY_LANGUAGE=English
 ### control the maximum chunk_ids stored in vector and graph db
 # MAX_SOURCE_IDS_PER_ENTITY=300
 # MAX_SOURCE_IDS_PER_RELATION=300
-### control chunk_ids limitation method: FIFO, FIFO
+### control chunk_ids limitation method: FIFO, KEEP
 ### FIFO: First in first out
 ### KEEP: Keep oldest (less merge action and faster)
 # SOURCE_IDS_LIMIT_METHOD=FIFO
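The two methods trade recency of references against merge work. As a rough illustration of the policies described in the comments above (a minimal sketch under my reading of those comments; `limit_source_ids` is a hypothetical helper, not code from this PR):

```python
# Hypothetical sketch of the two SOURCE_IDS_LIMIT_METHOD policies,
# inferred from the env comments above; not code from this PR.
def limit_source_ids(chunk_ids: list[str], limit: int, method: str = "FIFO") -> list[str]:
    if len(chunk_ids) <= limit:
        return chunk_ids
    if method == "FIFO":
        # First in, first out: evict the oldest ids to make room for new ones
        return chunk_ids[-limit:]
    # KEEP: retain the oldest ids and ignore later arrivals (no re-merge needed)
    return chunk_ids[:limit]

ids = [f"chunk-{i}" for i in range(5)]
print(limit_source_ids(ids, 3, "FIFO"))  # ['chunk-2', 'chunk-3', 'chunk-4']
print(limit_source_ids(ids, 3, "KEEP"))  # ['chunk-0', 'chunk-1', 'chunk-2']
```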
@@ -13,6 +13,7 @@ DEFAULT_MAX_GRAPH_NODES = 1000
 # Default values for extraction settings
 DEFAULT_SUMMARY_LANGUAGE = "English"  # Default language for document processing
 DEFAULT_MAX_GLEANING = 1
+DEFAULT_ENTITY_NAME_MAX_LENGTH = 256
 
 # Number of description fragments to trigger LLM summary
 DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8
@@ -58,6 +58,7 @@ from lightrag.constants import (
     SOURCE_IDS_LIMIT_METHOD_FIFO,
     DEFAULT_FILE_PATH_MORE_PLACEHOLDER,
     DEFAULT_MAX_FILE_PATHS,
+    DEFAULT_ENTITY_NAME_MAX_LENGTH,
 )
 from lightrag.kg.shared_storage import get_storage_keyed_lock
 import time
@@ -69,6 +70,27 @@ from dotenv import load_dotenv
 load_dotenv(dotenv_path=".env", override=False)
 
 
+def _truncate_entity_identifier(
+    identifier: str, limit: int, chunk_key: str, identifier_role: str
+) -> str:
+    """Truncate entity identifiers that exceed the configured length limit."""
+
+    if len(identifier) <= limit:
+        return identifier
+
+    display_value = identifier[:limit]
+    preview = identifier[:20]  # Show first 20 characters as preview
+    logger.warning(
+        "%s: %s exceeded %d characters (len: %d, preview: '%s...')",
+        chunk_key,
+        identifier_role,
+        limit,
+        len(identifier),
+        preview,
+    )
+    return display_value
+
+
 def chunking_by_token_size(
     tokenizer: Tokenizer,
     content: str,
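The helper added above is pure apart from the warning log, so its contract is easy to check in isolation. A quick sanity check (illustrative values only):

```python
# Illustrative check of _truncate_entity_identifier's contract.
short_name = "Acme Corporation"
long_name = "Acme " * 100  # 500 characters, well past the 256 limit

# Identifiers within the limit pass through unchanged:
assert _truncate_entity_identifier(short_name, 256, "chunk-0", "Entity name") == short_name

# Over-long identifiers are cut to exactly `limit` characters,
# and a warning with a 20-character preview is logged:
truncated = _truncate_entity_identifier(long_name, 256, "chunk-0", "Entity name")
assert truncated == long_name[:256] and len(truncated) == 256
```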
@@ -952,7 +974,14 @@ async def _process_extraction_result(
             record_attributes, chunk_key, timestamp, file_path
         )
         if entity_data is not None:
-            maybe_nodes[entity_data["entity_name"]].append(entity_data)
+            truncated_name = _truncate_entity_identifier(
+                entity_data["entity_name"],
+                DEFAULT_ENTITY_NAME_MAX_LENGTH,
+                chunk_key,
+                "Entity name",
+            )
+            entity_data["entity_name"] = truncated_name
+            maybe_nodes[truncated_name].append(entity_data)
             continue
 
         # Try to parse as relationship
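One consequence worth noting (my reading of the change, not something the PR states): because truncation happens before the `maybe_nodes` insert, two distinct names that agree on their first 256 characters now merge under a single key rather than failing downstream:

```python
from collections import defaultdict

# Two distinct over-long names that share a 256-character prefix
# collapse onto the same maybe_nodes key after truncation.
LIMIT = 256
base = "x" * LIMIT
name_a, name_b = base + "-variant-A", base + "-variant-B"

maybe_nodes = defaultdict(list)
for name in (name_a, name_b):
    truncated = name[:LIMIT]  # what _truncate_entity_identifier returns here
    maybe_nodes[truncated].append({"entity_name": truncated})

assert len(maybe_nodes) == 1          # merged under the shared truncated key
assert len(maybe_nodes[base]) == 2    # both extraction records are retained
```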
@@ -960,9 +989,21 @@ async def _process_extraction_result(
             record_attributes, chunk_key, timestamp, file_path
         )
         if relationship_data is not None:
-            maybe_edges[
-                (relationship_data["src_id"], relationship_data["tgt_id"])
-            ].append(relationship_data)
+            truncated_source = _truncate_entity_identifier(
+                relationship_data["src_id"],
+                DEFAULT_ENTITY_NAME_MAX_LENGTH,
+                chunk_key,
+                "Relationship source entity",
+            )
+            truncated_target = _truncate_entity_identifier(
+                relationship_data["tgt_id"],
+                DEFAULT_ENTITY_NAME_MAX_LENGTH,
+                chunk_key,
+                "Relationship target entity",
+            )
+            relationship_data["src_id"] = truncated_source
+            relationship_data["tgt_id"] = truncated_target
+            maybe_edges[(truncated_source, truncated_target)].append(relationship_data)
 
     return dict(maybe_nodes), dict(maybe_edges)
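For reference, the warning that fires on an over-long relationship endpoint renders along these lines (values are made up; the format string is the one added above):

```python
# Rendering the warning format string from the diff with sample values.
fmt = "%s: %s exceeded %d characters (len: %d, preview: '%s...')"
print(fmt % ("chunk-42", "Relationship source entity", 256, 512, "International Confere"))
# chunk-42: Relationship source entity exceeded 256 characters (len: 512, preview: 'International Confere...')
```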