Add entity/relation chunk tracking with configurable source ID limits

- Add entity_chunks & relation_chunks storage
- Implement KEEP/FIFO limit strategies
- Update env.example with new settings
- Add migration for chunk tracking data
- Support all KV storage

(cherry picked from commit dc62c78f98)
This commit is contained in:
yangdx 2025-10-20 15:24:15 +08:00 committed by Raphaël MANSUY
parent 7248e09fc4
commit cb5451faf8
10 changed files with 647 additions and 1615 deletions

View file

@ -29,7 +29,7 @@ WEBUI_DESCRIPTION="Simple and Fast Graph Based RAG System"
# OLLAMA_EMULATING_MODEL_NAME=lightrag # OLLAMA_EMULATING_MODEL_NAME=lightrag
OLLAMA_EMULATING_MODEL_TAG=latest OLLAMA_EMULATING_MODEL_TAG=latest
### Max nodes for graph retrieval (Ensure WebUI local settings are also updated, which is limited to this value) ### Max nodes return from graph retrieval in webui
# MAX_GRAPH_NODES=1000 # MAX_GRAPH_NODES=1000
### Logging level ### Logging level
@ -50,8 +50,6 @@ OLLAMA_EMULATING_MODEL_TAG=latest
# JWT_ALGORITHM=HS256 # JWT_ALGORITHM=HS256
### API-Key to access LightRAG Server API ### API-Key to access LightRAG Server API
### Use this key in HTTP requests with the 'X-API-Key' header
### Example: curl -H "X-API-Key: your-secure-api-key-here" http://localhost:9621/query
# LIGHTRAG_API_KEY=your-secure-api-key-here # LIGHTRAG_API_KEY=your-secure-api-key-here
# WHITELIST_PATHS=/health,/api/* # WHITELIST_PATHS=/health,/api/*
@ -75,6 +73,16 @@ ENABLE_LLM_CACHE=true
# MAX_RELATION_TOKENS=8000 # MAX_RELATION_TOKENS=8000
### control the maximum tokens send to LLM (include entities, relations and chunks) ### control the maximum tokens send to LLM (include entities, relations and chunks)
# MAX_TOTAL_TOKENS=30000 # MAX_TOTAL_TOKENS=30000
### control the maximum chunk_ids stored in vector and graph db
# MAX_SOURCE_IDS_PER_ENTITY=300
# MAX_SOURCE_IDS_PER_RELATION=300
### control chunk_ids limitation method: KEEP, FIFO (KEEP: Ignore New Chunks, FIFO: New chunks replace old chunks)
# SOURCE_IDS_LIMIT_METHOD=KEEP
### maximum number of related chunks per source entity or relation
### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph)
### Higher values increase re-ranking time
# RELATED_CHUNK_NUMBER=5
### chunk selection strategies ### chunk selection strategies
### VECTOR: Pick KG chunks by vector similarity, delivered chunks to the LLM aligning more closely with naive retrieval ### VECTOR: Pick KG chunks by vector similarity, delivered chunks to the LLM aligning more closely with naive retrieval
@ -121,9 +129,6 @@ ENABLE_LLM_CACHE_FOR_EXTRACT=true
### Document processing output language: English, Chinese, French, German ... ### Document processing output language: English, Chinese, French, German ...
SUMMARY_LANGUAGE=English SUMMARY_LANGUAGE=English
### PDF decryption password for protected PDF files
# PDF_DECRYPT_PASSWORD=your_pdf_password_here
### Entity types that the LLM will attempt to recognize ### Entity types that the LLM will attempt to recognize
# ENTITY_TYPES='["Person", "Creature", "Organization", "Location", "Event", "Concept", "Method", "Content", "Data", "Artifact", "NaturalObject"]' # ENTITY_TYPES='["Person", "Creature", "Organization", "Location", "Event", "Concept", "Method", "Content", "Data", "Artifact", "NaturalObject"]'
@ -140,22 +145,6 @@ SUMMARY_LANGUAGE=English
### Maximum context size sent to LLM for description summary ### Maximum context size sent to LLM for description summary
# SUMMARY_CONTEXT_SIZE=12000 # SUMMARY_CONTEXT_SIZE=12000
### control the maximum chunk_ids stored in vector and graph db
# MAX_SOURCE_IDS_PER_ENTITY=300
# MAX_SOURCE_IDS_PER_RELATION=300
### control chunk_ids limitation method: FIFO, KEEP
### FIFO: First in first out
### KEEP: Keep oldest (less merge action and faster)
# SOURCE_IDS_LIMIT_METHOD=FIFO
# Maximum number of file paths stored in entity/relation file_path field (For displayed only, does not affect query performance)
# MAX_FILE_PATHS=100
### maximum number of related chunks per source entity or relation
### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph)
### Higher values increase re-ranking time
# RELATED_CHUNK_NUMBER=5
############################### ###############################
### Concurrency Configuration ### Concurrency Configuration
############################### ###############################
@ -168,13 +157,10 @@ MAX_PARALLEL_INSERT=2
### Num of chunks send to Embedding in single request ### Num of chunks send to Embedding in single request
# EMBEDDING_BATCH_NUM=10 # EMBEDDING_BATCH_NUM=10
########################################################################### ###########################################################
### LLM Configuration ### LLM Configuration
### LLM_BINDING type: openai, ollama, lollms, azure_openai, aws_bedrock, gemini ### LLM_BINDING type: openai, ollama, lollms, azure_openai, aws_bedrock
### LLM_BINDING_HOST: host only for Ollama, endpoint for other LLM service ###########################################################
### If LightRAG deployed in Docker:
### uses host.docker.internal instead of localhost in LLM_BINDING_HOST
###########################################################################
### LLM request timeout setting for all llm (0 means no timeout for Ollama) ### LLM request timeout setting for all llm (0 means no timeout for Ollama)
# LLM_TIMEOUT=180 # LLM_TIMEOUT=180
@ -183,7 +169,7 @@ LLM_MODEL=gpt-4o
LLM_BINDING_HOST=https://api.openai.com/v1 LLM_BINDING_HOST=https://api.openai.com/v1
LLM_BINDING_API_KEY=your_api_key LLM_BINDING_API_KEY=your_api_key
### Env vars for Azure openai ### Optional for Azure
# AZURE_OPENAI_API_VERSION=2024-08-01-preview # AZURE_OPENAI_API_VERSION=2024-08-01-preview
# AZURE_OPENAI_DEPLOYMENT=gpt-4o # AZURE_OPENAI_DEPLOYMENT=gpt-4o
@ -193,21 +179,18 @@ LLM_BINDING_API_KEY=your_api_key
# LLM_BINDING_API_KEY=your_api_key # LLM_BINDING_API_KEY=your_api_key
# LLM_BINDING=openai # LLM_BINDING=openai
### Gemini example ### OpenAI Compatible API Specific Parameters
# LLM_BINDING=gemini ### Increased temperature values may mitigate infinite inference loops in certain LLM, such as Qwen3-30B.
# LLM_MODEL=gemini-flash-latest # OPENAI_LLM_TEMPERATURE=0.9
# LLM_BINDING_API_KEY=your_gemini_api_key ### Set the max_tokens to mitigate endless output of some LLM (less than LLM_TIMEOUT * llm_output_tokens/second, i.e. 9000 = 180s * 50 tokens/s)
# LLM_BINDING_HOST=https://generativelanguage.googleapis.com ### Typically, max_tokens does not include prompt content, though some models, such as Gemini Models, are exceptions
### For vLLM/SGLang deployed models, or most of OpenAI compatible API provider
# OPENAI_LLM_MAX_TOKENS=9000
### For OpenAI o1-mini or newer models
OPENAI_LLM_MAX_COMPLETION_TOKENS=9000
### use the following command to see all support options for OpenAI, azure_openai or OpenRouter #### OpenAI's new API utilizes max_completion_tokens instead of max_tokens
### lightrag-server --llm-binding gemini --help # OPENAI_LLM_MAX_COMPLETION_TOKENS=9000
### Gemini Specific Parameters
# GEMINI_LLM_MAX_OUTPUT_TOKENS=9000
# GEMINI_LLM_TEMPERATURE=0.7
### Enable Thinking
# GEMINI_LLM_THINKING_CONFIG='{"thinking_budget": -1, "include_thoughts": true}'
### Disable Thinking
# GEMINI_LLM_THINKING_CONFIG='{"thinking_budget": 0, "include_thoughts": false}'
### use the following command to see all support options for OpenAI, azure_openai or OpenRouter ### use the following command to see all support options for OpenAI, azure_openai or OpenRouter
### lightrag-server --llm-binding openai --help ### lightrag-server --llm-binding openai --help
@ -218,16 +201,6 @@ LLM_BINDING_API_KEY=your_api_key
### Qwen3 Specific Parameters deploy by vLLM ### Qwen3 Specific Parameters deploy by vLLM
# OPENAI_LLM_EXTRA_BODY='{"chat_template_kwargs": {"enable_thinking": false}}' # OPENAI_LLM_EXTRA_BODY='{"chat_template_kwargs": {"enable_thinking": false}}'
### OpenAI Compatible API Specific Parameters
### Increased temperature values may mitigate infinite inference loops in certain LLM, such as Qwen3-30B.
# OPENAI_LLM_TEMPERATURE=0.9
### Set the max_tokens to mitigate endless output of some LLM (less than LLM_TIMEOUT * llm_output_tokens/second, i.e. 9000 = 180s * 50 tokens/s)
### Typically, max_tokens does not include prompt content
### For vLLM/SGLang deployed models, or most of OpenAI compatible API provider
# OPENAI_LLM_MAX_TOKENS=9000
### For OpenAI o1-mini or newer models utilizes max_completion_tokens instead of max_tokens
OPENAI_LLM_MAX_COMPLETION_TOKENS=9000
### use the following command to see all support options for Ollama LLM ### use the following command to see all support options for Ollama LLM
### lightrag-server --llm-binding ollama --help ### lightrag-server --llm-binding ollama --help
### Ollama Server Specific Parameters ### Ollama Server Specific Parameters
@ -241,37 +214,24 @@ OLLAMA_LLM_NUM_CTX=32768
### Bedrock Specific Parameters ### Bedrock Specific Parameters
# BEDROCK_LLM_TEMPERATURE=1.0 # BEDROCK_LLM_TEMPERATURE=1.0
####################################################################################### ####################################################################################
### Embedding Configuration (Should not be changed after the first file processed) ### Embedding Configuration (Should not be changed after the first file processed)
### EMBEDDING_BINDING: ollama, openai, azure_openai, jina, lollms, aws_bedrock ### EMBEDDING_BINDING: ollama, openai, azure_openai, jina, lollms, aws_bedrock
### EMBEDDING_BINDING_HOST: host only for Ollama, endpoint for other Embedding service ####################################################################################
### If LightRAG deployed in Docker:
### uses host.docker.internal instead of localhost in EMBEDDING_BINDING_HOST
#######################################################################################
# EMBEDDING_TIMEOUT=30 # EMBEDDING_TIMEOUT=30
EMBEDDING_BINDING=ollama
### Control whether to send embedding_dim parameter to embedding API EMBEDDING_MODEL=bge-m3:latest
### IMPORTANT: Jina ALWAYS sends dimension parameter (API requirement) - this setting is ignored for Jina EMBEDDING_DIM=1024
### For OpenAI: Set to 'true' to enable dynamic dimension adjustment
### For OpenAI: Set to 'false' (default) to disable sending dimension parameter
### Note: Automatically ignored for backends that don't support dimension parameter (e.g., Ollama)
# Ollama embedding
# EMBEDDING_BINDING=ollama
# EMBEDDING_MODEL=bge-m3:latest
# EMBEDDING_DIM=1024
# EMBEDDING_BINDING_API_KEY=your_api_key
### If LightRAG deployed in Docker uses host.docker.internal instead of localhost
# EMBEDDING_BINDING_HOST=http://localhost:11434
### OpenAI compatible embedding
EMBEDDING_BINDING=openai
EMBEDDING_MODEL=text-embedding-3-large
EMBEDDING_DIM=3072
EMBEDDING_SEND_DIM=false
EMBEDDING_TOKEN_LIMIT=8192
EMBEDDING_BINDING_HOST=https://api.openai.com/v1
EMBEDDING_BINDING_API_KEY=your_api_key EMBEDDING_BINDING_API_KEY=your_api_key
# If the embedding service is deployed within the same Docker stack, use host.docker.internal instead of localhost
EMBEDDING_BINDING_HOST=http://localhost:11434
### OpenAI compatible (VoyageAI embedding openai compatible)
# EMBEDDING_BINDING=openai
# EMBEDDING_MODEL=text-embedding-3-large
# EMBEDDING_DIM=3072
# EMBEDDING_BINDING_HOST=https://api.openai.com/v1
# EMBEDDING_BINDING_API_KEY=your_api_key
### Optional for Azure ### Optional for Azure
# AZURE_EMBEDDING_DEPLOYMENT=text-embedding-3-large # AZURE_EMBEDDING_DEPLOYMENT=text-embedding-3-large
@ -279,16 +239,6 @@ EMBEDDING_BINDING_API_KEY=your_api_key
# AZURE_EMBEDDING_ENDPOINT=your_endpoint # AZURE_EMBEDDING_ENDPOINT=your_endpoint
# AZURE_EMBEDDING_API_KEY=your_api_key # AZURE_EMBEDDING_API_KEY=your_api_key
### Gemini embedding
# EMBEDDING_BINDING=gemini
# EMBEDDING_MODEL=gemini-embedding-001
# EMBEDDING_DIM=1536
# EMBEDDING_TOKEN_LIMIT=2048
# EMBEDDING_BINDING_HOST=https://generativelanguage.googleapis.com
# EMBEDDING_BINDING_API_KEY=your_api_key
### Gemini embedding requires sending dimension to server
# EMBEDDING_SEND_DIM=true
### Jina AI Embedding ### Jina AI Embedding
# EMBEDDING_BINDING=jina # EMBEDDING_BINDING=jina
# EMBEDDING_BINDING_HOST=https://api.jina.ai/v1/embeddings # EMBEDDING_BINDING_HOST=https://api.jina.ai/v1/embeddings
@ -349,8 +299,7 @@ POSTGRES_USER=your_username
POSTGRES_PASSWORD='your_password' POSTGRES_PASSWORD='your_password'
POSTGRES_DATABASE=your_database POSTGRES_DATABASE=your_database
POSTGRES_MAX_CONNECTIONS=12 POSTGRES_MAX_CONNECTIONS=12
### DB specific workspace should not be set, keep for compatible only # POSTGRES_WORKSPACE=forced_workspace_name
### POSTGRES_WORKSPACE=forced_workspace_name
### PostgreSQL Vector Storage Configuration ### PostgreSQL Vector Storage Configuration
### Vector storage type: HNSW, IVFFlat ### Vector storage type: HNSW, IVFFlat
@ -396,8 +345,7 @@ NEO4J_MAX_TRANSACTION_RETRY_TIME=30
NEO4J_MAX_CONNECTION_LIFETIME=300 NEO4J_MAX_CONNECTION_LIFETIME=300
NEO4J_LIVENESS_CHECK_TIMEOUT=30 NEO4J_LIVENESS_CHECK_TIMEOUT=30
NEO4J_KEEP_ALIVE=true NEO4J_KEEP_ALIVE=true
### DB specific workspace should not be set, keep for compatible only # NEO4J_WORKSPACE=forced_workspace_name
### NEO4J_WORKSPACE=forced_workspace_name
### MongoDB Configuration ### MongoDB Configuration
MONGO_URI=mongodb://root:root@localhost:27017/ MONGO_URI=mongodb://root:root@localhost:27017/
@ -411,14 +359,12 @@ MILVUS_DB_NAME=lightrag
# MILVUS_USER=root # MILVUS_USER=root
# MILVUS_PASSWORD=your_password # MILVUS_PASSWORD=your_password
# MILVUS_TOKEN=your_token # MILVUS_TOKEN=your_token
### DB specific workspace should not be set, keep for compatible only # MILVUS_WORKSPACE=forced_workspace_name
### MILVUS_WORKSPACE=forced_workspace_name
### Qdrant ### Qdrant
QDRANT_URL=http://localhost:6333 QDRANT_URL=http://localhost:6333
# QDRANT_API_KEY=your-api-key # QDRANT_API_KEY=your-api-key
### DB specific workspace should not be set, keep for compatible only # QDRANT_WORKSPACE=forced_workspace_name
### QDRANT_WORKSPACE=forced_workspace_name
### Redis ### Redis
REDIS_URI=redis://localhost:6379 REDIS_URI=redis://localhost:6379
@ -426,45 +372,11 @@ REDIS_SOCKET_TIMEOUT=30
REDIS_CONNECT_TIMEOUT=10 REDIS_CONNECT_TIMEOUT=10
REDIS_MAX_CONNECTIONS=100 REDIS_MAX_CONNECTIONS=100
REDIS_RETRY_ATTEMPTS=3 REDIS_RETRY_ATTEMPTS=3
### DB specific workspace should not be set, keep for compatible only # REDIS_WORKSPACE=forced_workspace_name
### REDIS_WORKSPACE=forced_workspace_name
### Memgraph Configuration ### Memgraph Configuration
MEMGRAPH_URI=bolt://localhost:7687 MEMGRAPH_URI=bolt://localhost:7687
MEMGRAPH_USERNAME= MEMGRAPH_USERNAME=
MEMGRAPH_PASSWORD= MEMGRAPH_PASSWORD=
MEMGRAPH_DATABASE=memgraph MEMGRAPH_DATABASE=memgraph
### DB specific workspace should not be set, keep for compatible only # MEMGRAPH_WORKSPACE=forced_workspace_name
### MEMGRAPH_WORKSPACE=forced_workspace_name
############################
### Evaluation Configuration
############################
### RAGAS evaluation models (used for RAG quality assessment)
### ⚠️ IMPORTANT: Both LLM and Embedding endpoints MUST be OpenAI-compatible
### Default uses OpenAI models for evaluation
### LLM Configuration for Evaluation
# EVAL_LLM_MODEL=gpt-4o-mini
### API key for LLM evaluation (fallback to OPENAI_API_KEY if not set)
# EVAL_LLM_BINDING_API_KEY=your_api_key
### Custom OpenAI-compatible endpoint for LLM evaluation (optional)
# EVAL_LLM_BINDING_HOST=https://api.openai.com/v1
### Embedding Configuration for Evaluation
# EVAL_EMBEDDING_MODEL=text-embedding-3-large
### API key for embeddings (fallback: EVAL_LLM_BINDING_API_KEY -> OPENAI_API_KEY)
# EVAL_EMBEDDING_BINDING_API_KEY=your_embedding_api_key
### Custom OpenAI-compatible endpoint for embeddings (fallback: EVAL_LLM_BINDING_HOST)
# EVAL_EMBEDDING_BINDING_HOST=https://api.openai.com/v1
### Performance Tuning
### Number of concurrent test case evaluations
### Lower values reduce API rate limit issues but increase evaluation time
# EVAL_MAX_CONCURRENT=2
### TOP_K query parameter of LightRAG (default: 10)
### Number of entities or relations retrieved from KG
# EVAL_QUERY_TOP_K=10
### LLM request retry and timeout settings for evaluation
# EVAL_LLM_MAX_RETRIES=5
# EVAL_LLM_TIMEOUT=180

View file

@ -378,6 +378,14 @@ class BaseKVStorage(StorageNameSpace, ABC):
None None
""" """
@abstractmethod
async def is_empty(self) -> bool:
"""Check if the storage is empty
Returns:
bool: True if storage contains no data, False otherwise
"""
@dataclass @dataclass
class BaseGraphStorage(StorageNameSpace, ABC): class BaseGraphStorage(StorageNameSpace, ABC):

View file

@ -13,7 +13,16 @@ DEFAULT_MAX_GRAPH_NODES = 1000
# Default values for extraction settings # Default values for extraction settings
DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing
DEFAULT_MAX_GLEANING = 1 DEFAULT_MAX_GLEANING = 1
DEFAULT_ENTITY_NAME_MAX_LENGTH = 256
DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 3
DEFAULT_MAX_SOURCE_IDS_PER_RELATION = 3
SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP"
SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO"
DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_KEEP
VALID_SOURCE_IDS_LIMIT_METHODS = {
SOURCE_IDS_LIMIT_METHOD_KEEP,
SOURCE_IDS_LIMIT_METHOD_FIFO,
}
# Number of description fragments to trigger LLM summary # Number of description fragments to trigger LLM summary
DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8 DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8
@ -38,7 +47,7 @@ DEFAULT_ENTITY_TYPES = [
"NaturalObject", "NaturalObject",
] ]
# Separator for: description, source_id and relation-key fields(Can not be changed after data inserted) # Separator for graph fields
GRAPH_FIELD_SEP = "<SEP>" GRAPH_FIELD_SEP = "<SEP>"
# Query and retrieval configuration defaults # Query and retrieval configuration defaults
@ -58,27 +67,8 @@ DEFAULT_HISTORY_TURNS = 0
DEFAULT_MIN_RERANK_SCORE = 0.0 DEFAULT_MIN_RERANK_SCORE = 0.0
DEFAULT_RERANK_BINDING = "null" DEFAULT_RERANK_BINDING = "null"
# Default source ids limit in meta data for entity and relation # File path configuration for vector and graph database(Should not be changed, used in Milvus Schema)
DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 300
DEFAULT_MAX_SOURCE_IDS_PER_RELATION = 300
### control chunk_ids limitation method: FIFO, KEEP
### FIFO: First in first out
### KEEP: Keep oldest (less merge action and faster)
SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP"
SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO"
DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_FIFO
VALID_SOURCE_IDS_LIMIT_METHODS = {
SOURCE_IDS_LIMIT_METHOD_KEEP,
SOURCE_IDS_LIMIT_METHOD_FIFO,
}
# Maximum number of file paths stored in entity/relation file_path field (For displayed only, does not affect query performance)
DEFAULT_MAX_FILE_PATHS = 100
# Field length of file_path in Milvus Schema for entity and relation (Should not be changed)
# file_path must store all file paths up to the DEFAULT_MAX_FILE_PATHS limit within the metadata.
DEFAULT_MAX_FILE_PATH_LENGTH = 32768 DEFAULT_MAX_FILE_PATH_LENGTH = 32768
# Placeholder for more file paths in meta data for entity and relation (Should not be changed)
DEFAULT_FILE_PATH_MORE_PLACEHOLDER = "truncated"
# Default temperature for LLM # Default temperature for LLM
DEFAULT_TEMPERATURE = 1.0 DEFAULT_TEMPERATURE = 1.0

View file

@ -13,7 +13,7 @@ from lightrag.utils import (
from lightrag.exceptions import StorageNotInitializedError from lightrag.exceptions import StorageNotInitializedError
from .shared_storage import ( from .shared_storage import (
get_namespace_data, get_namespace_data,
get_namespace_lock, get_storage_lock,
get_data_init_lock, get_data_init_lock,
get_update_flag, get_update_flag,
set_all_update_flags, set_all_update_flags,
@ -46,20 +46,12 @@ class JsonKVStorage(BaseKVStorage):
async def initialize(self): async def initialize(self):
"""Initialize storage data""" """Initialize storage data"""
self._storage_lock = get_namespace_lock( self._storage_lock = get_storage_lock()
self.final_namespace, workspace=self.workspace self.storage_updated = await get_update_flag(self.final_namespace)
)
self.storage_updated = await get_update_flag(
self.final_namespace, workspace=self.workspace
)
async with get_data_init_lock(): async with get_data_init_lock():
# check need_init must before get_namespace_data # check need_init must before get_namespace_data
need_init = await try_initialize_namespace( need_init = await try_initialize_namespace(self.final_namespace)
self.final_namespace, workspace=self.workspace self._data = await get_namespace_data(self.final_namespace)
)
self._data = await get_namespace_data(
self.final_namespace, workspace=self.workspace
)
if need_init: if need_init:
loaded_data = load_json(self._file_name) or {} loaded_data = load_json(self._file_name) or {}
async with self._storage_lock: async with self._storage_lock:
@ -89,23 +81,8 @@ class JsonKVStorage(BaseKVStorage):
logger.debug( logger.debug(
f"[{self.workspace}] Process {os.getpid()} KV writting {data_count} records to {self.namespace}" f"[{self.workspace}] Process {os.getpid()} KV writting {data_count} records to {self.namespace}"
) )
write_json(data_dict, self._file_name)
# Write JSON and check if sanitization was applied await clear_all_update_flags(self.final_namespace)
needs_reload = write_json(data_dict, self._file_name)
# If data was sanitized, reload cleaned data to update shared memory
if needs_reload:
logger.info(
f"[{self.workspace}] Reloading sanitized data into shared memory for {self.namespace}"
)
cleaned_data = load_json(self._file_name)
if cleaned_data is not None:
self._data.clear()
self._data.update(cleaned_data)
await clear_all_update_flags(
self.final_namespace, workspace=self.workspace
)
async def get_by_id(self, id: str) -> dict[str, Any] | None: async def get_by_id(self, id: str) -> dict[str, Any] | None:
async with self._storage_lock: async with self._storage_lock:
@ -178,7 +155,7 @@ class JsonKVStorage(BaseKVStorage):
v["_id"] = k v["_id"] = k
self._data.update(data) self._data.update(data)
await set_all_update_flags(self.final_namespace, workspace=self.workspace) await set_all_update_flags(self.final_namespace)
async def delete(self, ids: list[str]) -> None: async def delete(self, ids: list[str]) -> None:
"""Delete specific records from storage by their IDs """Delete specific records from storage by their IDs
@ -201,9 +178,7 @@ class JsonKVStorage(BaseKVStorage):
any_deleted = True any_deleted = True
if any_deleted: if any_deleted:
await set_all_update_flags( await set_all_update_flags(self.final_namespace)
self.final_namespace, workspace=self.workspace
)
async def is_empty(self) -> bool: async def is_empty(self) -> bool:
"""Check if the storage is empty """Check if the storage is empty
@ -231,9 +206,7 @@ class JsonKVStorage(BaseKVStorage):
try: try:
async with self._storage_lock: async with self._storage_lock:
self._data.clear() self._data.clear()
await set_all_update_flags( await set_all_update_flags(self.final_namespace)
self.final_namespace, workspace=self.workspace
)
await self.index_done_callback() await self.index_done_callback()
logger.info( logger.info(
@ -251,7 +224,7 @@ class JsonKVStorage(BaseKVStorage):
data: Original data dictionary that may contain legacy structure data: Original data dictionary that may contain legacy structure
Returns: Returns:
Migrated data dictionary with flattened cache keys (sanitized if needed) Migrated data dictionary with flattened cache keys
""" """
from lightrag.utils import generate_cache_key from lightrag.utils import generate_cache_key
@ -288,17 +261,8 @@ class JsonKVStorage(BaseKVStorage):
logger.info( logger.info(
f"[{self.workspace}] Migrated {migration_count} legacy cache entries to flattened structure" f"[{self.workspace}] Migrated {migration_count} legacy cache entries to flattened structure"
) )
# Persist migrated data immediately and check if sanitization was applied # Persist migrated data immediately
needs_reload = write_json(migrated_data, self._file_name) write_json(migrated_data, self._file_name)
# If data was sanitized during write, reload cleaned data
if needs_reload:
logger.info(
f"[{self.workspace}] Reloading sanitized migration data for {self.namespace}"
)
cleaned_data = load_json(self._file_name)
if cleaned_data is not None:
return cleaned_data # Return cleaned data to update shared memory
return migrated_data return migrated_data

View file

@ -174,22 +174,6 @@ class MongoKVStorage(BaseKVStorage):
existing_ids = {str(x["_id"]) async for x in cursor} existing_ids = {str(x["_id"]) async for x in cursor}
return keys - existing_ids return keys - existing_ids
async def get_all(self) -> dict[str, Any]:
"""Get all data from storage
Returns:
Dictionary containing all stored data
"""
cursor = self._data.find({})
result = {}
async for doc in cursor:
doc_id = doc.pop("_id")
# Ensure time fields are present for all documents
doc.setdefault("create_time", 0)
doc.setdefault("update_time", 0)
result[doc_id] = doc
return result
async def upsert(self, data: dict[str, dict[str, Any]]) -> None: async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
logger.debug(f"[{self.workspace}] Inserting {len(data)} to {self.namespace}") logger.debug(f"[{self.workspace}] Inserting {len(data)} to {self.namespace}")
if not data: if not data:
@ -235,6 +219,20 @@ class MongoKVStorage(BaseKVStorage):
# Mongo handles persistence automatically # Mongo handles persistence automatically
pass pass
async def is_empty(self) -> bool:
"""Check if the storage is empty for the current workspace and namespace
Returns:
bool: True if storage is empty, False otherwise
"""
try:
# Use count_documents with limit 1 for efficiency
count = await self._data.count_documents({}, limit=1)
return count == 0
except PyMongoError as e:
logger.error(f"[{self.workspace}] Error checking if storage is empty: {e}")
return True
async def delete(self, ids: list[str]) -> None: async def delete(self, ids: list[str]) -> None:
"""Delete documents with specified IDs """Delete documents with specified IDs
@ -463,6 +461,20 @@ class MongoDocStatusStorage(DocStatusStorage):
# Mongo handles persistence automatically # Mongo handles persistence automatically
pass pass
async def is_empty(self) -> bool:
"""Check if the storage is empty for the current workspace and namespace
Returns:
bool: True if storage is empty, False otherwise
"""
try:
# Use count_documents with limit 1 for efficiency
count = await self._data.count_documents({}, limit=1)
return count == 0
except PyMongoError as e:
logger.error(f"[{self.workspace}] Error checking if storage is empty: {e}")
return True
async def drop(self) -> dict[str, str]: async def drop(self) -> dict[str, str]:
"""Drop the storage by removing all documents in the collection. """Drop the storage by removing all documents in the collection.

View file

@ -21,7 +21,7 @@ from lightrag.base import (
DocStatus, DocStatus,
DocProcessingStatus, DocProcessingStatus,
) )
from ..kg.shared_storage import get_data_init_lock from ..kg.shared_storage import get_data_init_lock, get_storage_lock
import json import json
# Import tenacity for retry logic # Import tenacity for retry logic
@ -153,7 +153,7 @@ class RedisKVStorage(BaseKVStorage):
else: else:
# When workspace is empty, final_namespace equals original namespace # When workspace is empty, final_namespace equals original namespace
self.final_namespace = self.namespace self.final_namespace = self.namespace
self.workspace = "" self.workspace = "_"
logger.debug(f"Final namespace (no workspace): '{self.final_namespace}'") logger.debug(f"Final namespace (no workspace): '{self.final_namespace}'")
self._redis_url = os.environ.get( self._redis_url = os.environ.get(
@ -368,13 +368,12 @@ class RedisKVStorage(BaseKVStorage):
Returns: Returns:
bool: True if storage is empty, False otherwise bool: True if storage is empty, False otherwise
""" """
pattern = f"{self.final_namespace}:*" pattern = f"{self.namespace}:{self.workspace}:*"
try: try:
async with self._get_redis_connection() as redis: # Use scan to check if any keys exist
# Use scan to check if any keys exist async for key in self.redis.scan_iter(match=pattern, count=1):
async for key in redis.scan_iter(match=pattern, count=1): return False # Found at least one key
return False # Found at least one key return True # No keys found
return True # No keys found
except Exception as e: except Exception as e:
logger.error(f"[{self.workspace}] Error checking if storage is empty: {e}") logger.error(f"[{self.workspace}] Error checking if storage is empty: {e}")
return True return True
@ -401,39 +400,42 @@ class RedisKVStorage(BaseKVStorage):
Returns: Returns:
dict[str, str]: Status of the operation with keys 'status' and 'message' dict[str, str]: Status of the operation with keys 'status' and 'message'
""" """
async with self._get_redis_connection() as redis: async with get_storage_lock():
try: async with self._get_redis_connection() as redis:
# Use SCAN to find all keys with the namespace prefix try:
pattern = f"{self.final_namespace}:*" # Use SCAN to find all keys with the namespace prefix
cursor = 0 pattern = f"{self.final_namespace}:*"
deleted_count = 0 cursor = 0
deleted_count = 0
while True: while True:
cursor, keys = await redis.scan(cursor, match=pattern, count=1000) cursor, keys = await redis.scan(
if keys: cursor, match=pattern, count=1000
# Delete keys in batches )
pipe = redis.pipeline() if keys:
for key in keys: # Delete keys in batches
pipe.delete(key) pipe = redis.pipeline()
results = await pipe.execute() for key in keys:
deleted_count += sum(results) pipe.delete(key)
results = await pipe.execute()
deleted_count += sum(results)
if cursor == 0: if cursor == 0:
break break
logger.info( logger.info(
f"[{self.workspace}] Dropped {deleted_count} keys from {self.namespace}" f"[{self.workspace}] Dropped {deleted_count} keys from {self.namespace}"
) )
return { return {
"status": "success", "status": "success",
"message": f"{deleted_count} keys dropped", "message": f"{deleted_count} keys dropped",
} }
except Exception as e: except Exception as e:
logger.error( logger.error(
f"[{self.workspace}] Error dropping keys from {self.namespace}: {e}" f"[{self.workspace}] Error dropping keys from {self.namespace}: {e}"
) )
return {"status": "error", "message": str(e)} return {"status": "error", "message": str(e)}
async def _migrate_legacy_cache_structure(self): async def _migrate_legacy_cache_structure(self):
"""Migrate legacy nested cache structure to flattened structure for Redis """Migrate legacy nested cache structure to flattened structure for Redis
@ -1088,32 +1090,35 @@ class RedisDocStatusStorage(DocStatusStorage):
async def drop(self) -> dict[str, str]: async def drop(self) -> dict[str, str]:
"""Drop all document status data from storage and clean up resources""" """Drop all document status data from storage and clean up resources"""
try: async with get_storage_lock():
async with self._get_redis_connection() as redis: try:
# Use SCAN to find all keys with the namespace prefix async with self._get_redis_connection() as redis:
pattern = f"{self.final_namespace}:*" # Use SCAN to find all keys with the namespace prefix
cursor = 0 pattern = f"{self.final_namespace}:*"
deleted_count = 0 cursor = 0
deleted_count = 0
while True: while True:
cursor, keys = await redis.scan(cursor, match=pattern, count=1000) cursor, keys = await redis.scan(
if keys: cursor, match=pattern, count=1000
# Delete keys in batches )
pipe = redis.pipeline() if keys:
for key in keys: # Delete keys in batches
pipe.delete(key) pipe = redis.pipeline()
results = await pipe.execute() for key in keys:
deleted_count += sum(results) pipe.delete(key)
results = await pipe.execute()
deleted_count += sum(results)
if cursor == 0: if cursor == 0:
break break
logger.info( logger.info(
f"[{self.workspace}] Dropped {deleted_count} doc status keys from {self.namespace}" f"[{self.workspace}] Dropped {deleted_count} doc status keys from {self.namespace}"
)
return {"status": "success", "message": "data dropped"}
except Exception as e:
logger.error(
f"[{self.workspace}] Error dropping doc status {self.namespace}: {e}"
) )
return {"status": "success", "message": "data dropped"} return {"status": "error", "message": str(e)}
except Exception as e:
logger.error(
f"[{self.workspace}] Error dropping doc status {self.namespace}: {e}"
)
return {"status": "error", "message": str(e)}

View file

@ -3,7 +3,6 @@ from __future__ import annotations
import traceback import traceback
import asyncio import asyncio
import configparser import configparser
import inspect
import os import os
import time import time
import warnings import warnings
@ -13,7 +12,6 @@ from functools import partial
from typing import ( from typing import (
Any, Any,
AsyncIterator, AsyncIterator,
Awaitable,
Callable, Callable,
Iterator, Iterator,
cast, cast,
@ -22,10 +20,8 @@ from typing import (
Optional, Optional,
List, List,
Dict, Dict,
Union,
) )
from lightrag.prompt import PROMPTS from lightrag.prompt import PROMPTS
from lightrag.exceptions import PipelineCancelledException
from lightrag.constants import ( from lightrag.constants import (
DEFAULT_MAX_GLEANING, DEFAULT_MAX_GLEANING,
DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE, DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE,
@ -51,8 +47,6 @@ from lightrag.constants import (
DEFAULT_LLM_TIMEOUT, DEFAULT_LLM_TIMEOUT,
DEFAULT_EMBEDDING_TIMEOUT, DEFAULT_EMBEDDING_TIMEOUT,
DEFAULT_SOURCE_IDS_LIMIT_METHOD, DEFAULT_SOURCE_IDS_LIMIT_METHOD,
DEFAULT_MAX_FILE_PATHS,
DEFAULT_FILE_PATH_MORE_PLACEHOLDER,
) )
from lightrag.utils import get_env_value from lightrag.utils import get_env_value
@ -64,10 +58,9 @@ from lightrag.kg import (
from lightrag.kg.shared_storage import ( from lightrag.kg.shared_storage import (
get_namespace_data, get_namespace_data,
get_pipeline_status_lock,
get_graph_db_lock,
get_data_init_lock, get_data_init_lock,
get_default_workspace,
set_default_workspace,
get_namespace_lock,
) )
from lightrag.base import ( from lightrag.base import (
@ -91,7 +84,7 @@ from lightrag.operate import (
merge_nodes_and_edges, merge_nodes_and_edges,
kg_query, kg_query,
naive_query, naive_query,
rebuild_knowledge_from_chunks, _rebuild_knowledge_from_chunks,
) )
from lightrag.constants import GRAPH_FIELD_SEP from lightrag.constants import GRAPH_FIELD_SEP
from lightrag.utils import ( from lightrag.utils import (
@ -247,13 +240,11 @@ class LightRAG:
int, int,
int, int,
], ],
Union[List[Dict[str, Any]], Awaitable[List[Dict[str, Any]]]], List[Dict[str, Any]],
] = field(default_factory=lambda: chunking_by_token_size) ] = field(default_factory=lambda: chunking_by_token_size)
""" """
Custom chunking function for splitting text into chunks before processing. Custom chunking function for splitting text into chunks before processing.
The function can be either synchronous or asynchronous.
The function should take the following parameters: The function should take the following parameters:
- `tokenizer`: A Tokenizer instance to use for tokenization. - `tokenizer`: A Tokenizer instance to use for tokenization.
@ -263,8 +254,7 @@ class LightRAG:
- `chunk_token_size`: The maximum number of tokens per chunk. - `chunk_token_size`: The maximum number of tokens per chunk.
- `chunk_overlap_token_size`: The number of overlapping tokens between consecutive chunks. - `chunk_overlap_token_size`: The number of overlapping tokens between consecutive chunks.
The function should return a list of dictionaries (or an awaitable that resolves to a list), The function should return a list of dictionaries, where each dictionary contains the following keys:
where each dictionary contains the following keys:
- `tokens`: The number of tokens in the chunk. - `tokens`: The number of tokens in the chunk.
- `content`: The text content of the chunk. - `content`: The text content of the chunk.
@ -277,9 +267,6 @@ class LightRAG:
embedding_func: EmbeddingFunc | None = field(default=None) embedding_func: EmbeddingFunc | None = field(default=None)
"""Function for computing text embeddings. Must be set before use.""" """Function for computing text embeddings. Must be set before use."""
embedding_token_limit: int | None = field(default=None, init=False)
"""Token limit for embedding model. Set automatically from embedding_func.max_token_size in __post_init__."""
embedding_batch_num: int = field(default=int(os.getenv("EMBEDDING_BATCH_NUM", 10))) embedding_batch_num: int = field(default=int(os.getenv("EMBEDDING_BATCH_NUM", 10)))
"""Batch size for embedding computations.""" """Batch size for embedding computations."""
@ -406,14 +393,6 @@ class LightRAG:
) )
"""Strategy for enforcing source_id limits: IGNORE_NEW or FIFO.""" """Strategy for enforcing source_id limits: IGNORE_NEW or FIFO."""
max_file_paths: int = field(
default=get_env_value("MAX_FILE_PATHS", DEFAULT_MAX_FILE_PATHS, int)
)
"""Maximum number of file paths to store in entity/relation file_path field."""
file_path_more_placeholder: str = field(default=DEFAULT_FILE_PATH_MORE_PLACEHOLDER)
"""Placeholder text when file paths exceed max_file_paths limit."""
addon_params: dict[str, Any] = field( addon_params: dict[str, Any] = field(
default_factory=lambda: { default_factory=lambda: {
"language": get_env_value( "language": get_env_value(
@ -523,16 +502,6 @@ class LightRAG:
logger.debug(f"LightRAG init with param:\n {_print_config}\n") logger.debug(f"LightRAG init with param:\n {_print_config}\n")
# Init Embedding # Init Embedding
# Step 1: Capture max_token_size before applying decorator (decorator strips dataclass attributes)
embedding_max_token_size = None
if self.embedding_func and hasattr(self.embedding_func, "max_token_size"):
embedding_max_token_size = self.embedding_func.max_token_size
logger.debug(
f"Captured embedding max_token_size: {embedding_max_token_size}"
)
self.embedding_token_limit = embedding_max_token_size
# Step 2: Apply priority wrapper decorator
self.embedding_func = priority_limit_async_func_call( self.embedding_func = priority_limit_async_func_call(
self.embedding_func_max_async, self.embedding_func_max_async,
llm_timeout=self.default_embedding_timeout, llm_timeout=self.default_embedding_timeout,
@ -659,22 +628,6 @@ class LightRAG:
async def initialize_storages(self): async def initialize_storages(self):
"""Storage initialization must be called one by one to prevent deadlock""" """Storage initialization must be called one by one to prevent deadlock"""
if self._storages_status == StoragesStatus.CREATED: if self._storages_status == StoragesStatus.CREATED:
# Set the first initialized workspace will set the default workspace
# Allows namespace operation without specifying workspace for backward compatibility
default_workspace = get_default_workspace()
if default_workspace is None:
set_default_workspace(self.workspace)
elif default_workspace != self.workspace:
logger.warning(
f"Creating LightRAG instance with workspace='{self.workspace}' "
f"but default workspace is already set to '{default_workspace}'."
)
# Auto-initialize pipeline_status for this workspace
from lightrag.kg.shared_storage import initialize_pipeline_status
await initialize_pipeline_status(workspace=self.workspace)
for storage in ( for storage in (
self.full_docs, self.full_docs,
self.text_chunks, self.text_chunks,
@ -746,7 +699,7 @@ class LightRAG:
async def check_and_migrate_data(self): async def check_and_migrate_data(self):
"""Check if data migration is needed and perform migration if necessary""" """Check if data migration is needed and perform migration if necessary"""
async with get_data_init_lock(): async with get_data_init_lock(enable_logging=True):
try: try:
# Check if migration is needed: # Check if migration is needed:
# 1. chunk_entity_relation_graph has entities and relations (count > 0) # 1. chunk_entity_relation_graph has entities and relations (count > 0)
@ -924,13 +877,13 @@ class LightRAG:
need_entity_migration = await self.entity_chunks.is_empty() need_entity_migration = await self.entity_chunks.is_empty()
except Exception as exc: # pragma: no cover - defensive logging except Exception as exc: # pragma: no cover - defensive logging
logger.error(f"Failed to check entity chunks storage: {exc}") logger.error(f"Failed to check entity chunks storage: {exc}")
raise exc need_entity_migration = True
try: try:
need_relation_migration = await self.relation_chunks.is_empty() need_relation_migration = await self.relation_chunks.is_empty()
except Exception as exc: # pragma: no cover - defensive logging except Exception as exc: # pragma: no cover - defensive logging
logger.error(f"Failed to check relation chunks storage: {exc}") logger.error(f"Failed to check relation chunks storage: {exc}")
raise exc need_relation_migration = True
if not need_entity_migration and not need_relation_migration: if not need_entity_migration and not need_relation_migration:
return return
@ -1609,12 +1562,8 @@ class LightRAG:
""" """
# Get pipeline status shared data and lock # Get pipeline status shared data and lock
pipeline_status = await get_namespace_data( pipeline_status = await get_namespace_data("pipeline_status")
"pipeline_status", workspace=self.workspace pipeline_status_lock = get_pipeline_status_lock()
)
pipeline_status_lock = get_namespace_lock(
"pipeline_status", workspace=self.workspace
)
# Check if another process is already processing the queue # Check if another process is already processing the queue
async with pipeline_status_lock: async with pipeline_status_lock:
@ -1644,7 +1593,6 @@ class LightRAG:
"batchs": 0, # Total number of files to be processed "batchs": 0, # Total number of files to be processed
"cur_batch": 0, # Number of files already processed "cur_batch": 0, # Number of files already processed
"request_pending": False, # Clear any previous request "request_pending": False, # Clear any previous request
"cancellation_requested": False, # Initialize cancellation flag
"latest_message": "", "latest_message": "",
} }
) )
@ -1661,22 +1609,6 @@ class LightRAG:
try: try:
# Process documents until no more documents or requests # Process documents until no more documents or requests
while True: while True:
# Check for cancellation request at the start of main loop
async with pipeline_status_lock:
if pipeline_status.get("cancellation_requested", False):
# Clear pending request
pipeline_status["request_pending"] = False
# Celar cancellation flag
pipeline_status["cancellation_requested"] = False
log_message = "Pipeline cancelled by user"
logger.info(log_message)
pipeline_status["latest_message"] = log_message
pipeline_status["history_messages"].append(log_message)
# Exit directly, skipping request_pending check
return
if not to_process_docs: if not to_process_docs:
log_message = "All enqueued documents have been processed" log_message = "All enqueued documents have been processed"
logger.info(log_message) logger.info(log_message)
@ -1739,25 +1671,14 @@ class LightRAG:
semaphore: asyncio.Semaphore, semaphore: asyncio.Semaphore,
) -> None: ) -> None:
"""Process single document""" """Process single document"""
# Initialize variables at the start to prevent UnboundLocalError in error handling
file_path = "unknown_source"
current_file_number = 0
file_extraction_stage_ok = False file_extraction_stage_ok = False
processing_start_time = int(time.time())
first_stage_tasks = []
entity_relation_task = None
async with semaphore: async with semaphore:
nonlocal processed_count nonlocal processed_count
current_file_number = 0
# Initialize to prevent UnboundLocalError in error handling # Initialize to prevent UnboundLocalError in error handling
first_stage_tasks = [] first_stage_tasks = []
entity_relation_task = None entity_relation_task = None
try: try:
# Check for cancellation before starting document processing
async with pipeline_status_lock:
if pipeline_status.get("cancellation_requested", False):
raise PipelineCancelledException("User cancelled")
# Get file path from status document # Get file path from status document
file_path = getattr( file_path = getattr(
status_doc, "file_path", "unknown_source" status_doc, "file_path", "unknown_source"
@ -1796,28 +1717,7 @@ class LightRAG:
) )
content = content_data["content"] content = content_data["content"]
# Call chunking function, supporting both sync and async implementations # Generate chunks from document
chunking_result = self.chunking_func(
self.tokenizer,
content,
split_by_character,
split_by_character_only,
self.chunk_overlap_token_size,
self.chunk_token_size,
)
# If result is awaitable, await to get actual result
if inspect.isawaitable(chunking_result):
chunking_result = await chunking_result
# Validate return type
if not isinstance(chunking_result, (list, tuple)):
raise TypeError(
f"chunking_func must return a list or tuple of dicts, "
f"got {type(chunking_result)}"
)
# Build chunks dictionary
chunks: dict[str, Any] = { chunks: dict[str, Any] = {
compute_mdhash_id(dp["content"], prefix="chunk-"): { compute_mdhash_id(dp["content"], prefix="chunk-"): {
**dp, **dp,
@ -1825,7 +1725,14 @@ class LightRAG:
"file_path": file_path, # Add file path to each chunk "file_path": file_path, # Add file path to each chunk
"llm_cache_list": [], # Initialize empty LLM cache list for each chunk "llm_cache_list": [], # Initialize empty LLM cache list for each chunk
} }
for dp in chunking_result for dp in self.chunking_func(
self.tokenizer,
content,
split_by_character,
split_by_character_only,
self.chunk_overlap_token_size,
self.chunk_token_size,
)
} }
if not chunks: if not chunks:
@ -1834,11 +1741,6 @@ class LightRAG:
# Record processing start time # Record processing start time
processing_start_time = int(time.time()) processing_start_time = int(time.time())
# Check for cancellation before entity extraction
async with pipeline_status_lock:
if pipeline_status.get("cancellation_requested", False):
raise PipelineCancelledException("User cancelled")
# Process document in two stages # Process document in two stages
# Stage 1: Process text chunks and docs (parallel execution) # Stage 1: Process text chunks and docs (parallel execution)
doc_status_task = asyncio.create_task( doc_status_task = asyncio.create_task(
@ -1889,33 +1791,20 @@ class LightRAG:
chunks, pipeline_status, pipeline_status_lock chunks, pipeline_status, pipeline_status_lock
) )
) )
chunk_results = await entity_relation_task await entity_relation_task
file_extraction_stage_ok = True file_extraction_stage_ok = True
except Exception as e: except Exception as e:
# Check if this is a user cancellation # Log error and update pipeline status
if isinstance(e, PipelineCancelledException): logger.error(traceback.format_exc())
# User cancellation - log brief message only, no traceback error_msg = f"Failed to extract document {current_file_number}/{total_files}: {file_path}"
error_msg = f"User cancelled {current_file_number}/{total_files}: {file_path}" logger.error(error_msg)
logger.warning(error_msg) async with pipeline_status_lock:
async with pipeline_status_lock: pipeline_status["latest_message"] = error_msg
pipeline_status["latest_message"] = error_msg pipeline_status["history_messages"].append(
pipeline_status["history_messages"].append( traceback.format_exc()
error_msg )
) pipeline_status["history_messages"].append(error_msg)
else:
# Other exceptions - log with traceback
logger.error(traceback.format_exc())
error_msg = f"Failed to extract document {current_file_number}/{total_files}: {file_path}"
logger.error(error_msg)
async with pipeline_status_lock:
pipeline_status["latest_message"] = error_msg
pipeline_status["history_messages"].append(
traceback.format_exc()
)
pipeline_status["history_messages"].append(
error_msg
)
# Cancel tasks that are not yet completed # Cancel tasks that are not yet completed
all_tasks = first_stage_tasks + ( all_tasks = first_stage_tasks + (
@ -1925,14 +1814,9 @@ class LightRAG:
if task and not task.done(): if task and not task.done():
task.cancel() task.cancel()
# Persistent llm cache with error handling # Persistent llm cache
if self.llm_response_cache: if self.llm_response_cache:
try: await self.llm_response_cache.index_done_callback()
await self.llm_response_cache.index_done_callback()
except Exception as persist_error:
logger.error(
f"Failed to persist LLM cache: {persist_error}"
)
# Record processing end time for failed case # Record processing end time for failed case
processing_end_time = int(time.time()) processing_end_time = int(time.time())
@ -1962,16 +1846,8 @@ class LightRAG:
# Concurrency is controlled by keyed lock for individual entities and relationships # Concurrency is controlled by keyed lock for individual entities and relationships
if file_extraction_stage_ok: if file_extraction_stage_ok:
try: try:
# Check for cancellation before merge # Get chunk_results from entity_relation_task
async with pipeline_status_lock: chunk_results = await entity_relation_task
if pipeline_status.get(
"cancellation_requested", False
):
raise PipelineCancelledException(
"User cancelled"
)
# Use chunk_results from entity_relation_task
await merge_nodes_and_edges( await merge_nodes_and_edges(
chunk_results=chunk_results, # result collected from entity_relation_task chunk_results=chunk_results, # result collected from entity_relation_task
knowledge_graph_inst=self.chunk_entity_relation_graph, knowledge_graph_inst=self.chunk_entity_relation_graph,
@ -2028,38 +1904,22 @@ class LightRAG:
) )
except Exception as e: except Exception as e:
# Check if this is a user cancellation # Log error and update pipeline status
if isinstance(e, PipelineCancelledException): logger.error(traceback.format_exc())
# User cancellation - log brief message only, no traceback error_msg = f"Merging stage failed in document {current_file_number}/{total_files}: {file_path}"
error_msg = f"User cancelled during merge {current_file_number}/{total_files}: {file_path}" logger.error(error_msg)
logger.warning(error_msg) async with pipeline_status_lock:
async with pipeline_status_lock: pipeline_status["latest_message"] = error_msg
pipeline_status["latest_message"] = error_msg pipeline_status["history_messages"].append(
pipeline_status["history_messages"].append( traceback.format_exc()
error_msg )
) pipeline_status["history_messages"].append(
else: error_msg
# Other exceptions - log with traceback )
logger.error(traceback.format_exc())
error_msg = f"Merging stage failed in document {current_file_number}/{total_files}: {file_path}"
logger.error(error_msg)
async with pipeline_status_lock:
pipeline_status["latest_message"] = error_msg
pipeline_status["history_messages"].append(
traceback.format_exc()
)
pipeline_status["history_messages"].append(
error_msg
)
# Persistent llm cache with error handling # Persistent llm cache
if self.llm_response_cache: if self.llm_response_cache:
try: await self.llm_response_cache.index_done_callback()
await self.llm_response_cache.index_done_callback()
except Exception as persist_error:
logger.error(
f"Failed to persist LLM cache: {persist_error}"
)
# Record processing end time for failed case # Record processing end time for failed case
processing_end_time = int(time.time()) processing_end_time = int(time.time())
@ -2100,19 +1960,7 @@ class LightRAG:
) )
# Wait for all document processing to complete # Wait for all document processing to complete
try: await asyncio.gather(*doc_tasks)
await asyncio.gather(*doc_tasks)
except PipelineCancelledException:
# Cancel all remaining tasks
for task in doc_tasks:
if not task.done():
task.cancel()
# Wait for all tasks to complete cancellation
await asyncio.wait(doc_tasks, return_when=asyncio.ALL_COMPLETED)
# Exit directly (document statuses already updated in process_document)
return
# Check if there's a pending request to process more documents (with lock) # Check if there's a pending request to process more documents (with lock)
has_pending_request = False has_pending_request = False
@ -2143,14 +1991,11 @@ class LightRAG:
to_process_docs.update(pending_docs) to_process_docs.update(pending_docs)
finally: finally:
log_message = "Enqueued document processing pipeline stopped" log_message = "Enqueued document processing pipeline stoped"
logger.info(log_message) logger.info(log_message)
# Always reset busy status and cancellation flag when done or if an exception occurs (with lock) # Always reset busy status when done or if an exception occurs (with lock)
async with pipeline_status_lock: async with pipeline_status_lock:
pipeline_status["busy"] = False pipeline_status["busy"] = False
pipeline_status["cancellation_requested"] = (
False # Always reset cancellation flag
)
pipeline_status["latest_message"] = log_message pipeline_status["latest_message"] = log_message
pipeline_status["history_messages"].append(log_message) pipeline_status["history_messages"].append(log_message)
@ -2938,9 +2783,7 @@ class LightRAG:
# Return the dictionary containing statuses only for the found document IDs # Return the dictionary containing statuses only for the found document IDs
return found_statuses return found_statuses
async def adelete_by_doc_id( async def adelete_by_doc_id(self, doc_id: str) -> DeletionResult:
self, doc_id: str, delete_llm_cache: bool = False
) -> DeletionResult:
"""Delete a document and all its related data, including chunks, graph elements. """Delete a document and all its related data, including chunks, graph elements.
This method orchestrates a comprehensive deletion process for a given document ID. This method orchestrates a comprehensive deletion process for a given document ID.
@ -2950,8 +2793,6 @@ class LightRAG:
Args: Args:
doc_id (str): The unique identifier of the document to be deleted. doc_id (str): The unique identifier of the document to be deleted.
delete_llm_cache (bool): Whether to delete cached LLM extraction results
associated with the document. Defaults to False.
Returns: Returns:
DeletionResult: An object containing the outcome of the deletion process. DeletionResult: An object containing the outcome of the deletion process.
@ -2961,55 +2802,12 @@ class LightRAG:
- `status_code` (int): HTTP status code (e.g., 200, 404, 500). - `status_code` (int): HTTP status code (e.g., 200, 404, 500).
- `file_path` (str | None): The file path of the deleted document, if available. - `file_path` (str | None): The file path of the deleted document, if available.
""" """
# Get pipeline status shared data and lock for validation
pipeline_status = await get_namespace_data(
"pipeline_status", workspace=self.workspace
)
pipeline_status_lock = get_namespace_lock(
"pipeline_status", workspace=self.workspace
)
# Track whether WE acquired the pipeline
we_acquired_pipeline = False
# Check and acquire pipeline if needed
async with pipeline_status_lock:
if not pipeline_status.get("busy", False):
# Pipeline is idle - WE acquire it for this deletion
we_acquired_pipeline = True
pipeline_status.update(
{
"busy": True,
"job_name": "Deleting 1 document",
"job_start": datetime.now(timezone.utc).isoformat(),
"docs": 1,
"batchs": 1,
"cur_batch": 0,
"request_pending": False,
"cancellation_requested": False,
"latest_message": f"Starting deletion for document: {doc_id}",
}
)
# Initialize history messages
pipeline_status["history_messages"][:] = [
f"Starting deletion for document: {doc_id}"
]
else:
# Pipeline already busy - verify it's a deletion job
job_name = pipeline_status.get("job_name", "").lower()
if "deleting" not in job_name or "document" not in job_name:
return DeletionResult(
status="not_allowed",
doc_id=doc_id,
message=f"Deletion not allowed: current job '{pipeline_status.get('job_name')}' is not a document deletion job",
status_code=403,
file_path=None,
)
# Pipeline is busy with deletion - proceed without acquiring
deletion_operations_started = False deletion_operations_started = False
original_exception = None original_exception = None
doc_llm_cache_ids: list[str] = []
# Get pipeline status shared data and lock for status updates
pipeline_status = await get_namespace_data("pipeline_status")
pipeline_status_lock = get_pipeline_status_lock()
async with pipeline_status_lock: async with pipeline_status_lock:
log_message = f"Starting deletion process for document {doc_id}" log_message = f"Starting deletion process for document {doc_id}"
@ -3106,57 +2904,6 @@ class LightRAG:
# Mark that deletion operations have started # Mark that deletion operations have started
deletion_operations_started = True deletion_operations_started = True
if delete_llm_cache and chunk_ids:
if not self.llm_response_cache:
logger.info(
"Skipping LLM cache collection for document %s because cache storage is unavailable",
doc_id,
)
elif not self.text_chunks:
logger.info(
"Skipping LLM cache collection for document %s because text chunk storage is unavailable",
doc_id,
)
else:
try:
chunk_data_list = await self.text_chunks.get_by_ids(
list(chunk_ids)
)
seen_cache_ids: set[str] = set()
for chunk_data in chunk_data_list:
if not chunk_data or not isinstance(chunk_data, dict):
continue
cache_ids = chunk_data.get("llm_cache_list", [])
if not isinstance(cache_ids, list):
continue
for cache_id in cache_ids:
if (
isinstance(cache_id, str)
and cache_id
and cache_id not in seen_cache_ids
):
doc_llm_cache_ids.append(cache_id)
seen_cache_ids.add(cache_id)
if doc_llm_cache_ids:
logger.info(
"Collected %d LLM cache entries for document %s",
len(doc_llm_cache_ids),
doc_id,
)
else:
logger.info(
"No LLM cache entries found for document %s", doc_id
)
except Exception as cache_collect_error:
logger.error(
"Failed to collect LLM cache ids for document %s: %s",
doc_id,
cache_collect_error,
)
raise Exception(
f"Failed to collect LLM cache ids for document {doc_id}: {cache_collect_error}"
) from cache_collect_error
# 4. Analyze entities and relationships that will be affected # 4. Analyze entities and relationships that will be affected
entities_to_delete = set() entities_to_delete = set()
entities_to_rebuild = {} # entity_name -> remaining chunk id list entities_to_rebuild = {} # entity_name -> remaining chunk id list
@ -3242,9 +2989,6 @@ class LightRAG:
] ]
if not existing_sources: if not existing_sources:
# No chunk references means this entity should be deleted
entities_to_delete.add(node_label)
entity_chunk_updates[node_label] = []
continue continue
remaining_sources = subtract_source_ids(existing_sources, chunk_ids) remaining_sources = subtract_source_ids(existing_sources, chunk_ids)
@ -3266,7 +3010,6 @@ class LightRAG:
# Process relationships # Process relationships
for edge_data in affected_edges: for edge_data in affected_edges:
# source target is not in normalize order in graph db property
src = edge_data.get("source") src = edge_data.get("source")
tgt = edge_data.get("target") tgt = edge_data.get("target")
@ -3303,9 +3046,6 @@ class LightRAG:
] ]
if not existing_sources: if not existing_sources:
# No chunk references means this relationship should be deleted
relationships_to_delete.add(edge_tuple)
relation_chunk_updates[edge_tuple] = []
continue continue
remaining_sources = subtract_source_ids(existing_sources, chunk_ids) remaining_sources = subtract_source_ids(existing_sources, chunk_ids)
@ -3331,31 +3071,38 @@ class LightRAG:
if entity_chunk_updates and self.entity_chunks: if entity_chunk_updates and self.entity_chunks:
entity_upsert_payload = {} entity_upsert_payload = {}
entity_delete_ids: set[str] = set()
for entity_name, remaining in entity_chunk_updates.items(): for entity_name, remaining in entity_chunk_updates.items():
if not remaining: if not remaining:
# Empty entities are deleted alongside graph nodes later entity_delete_ids.add(entity_name)
continue else:
entity_upsert_payload[entity_name] = { entity_upsert_payload[entity_name] = {
"chunk_ids": remaining, "chunk_ids": remaining,
"count": len(remaining), "count": len(remaining),
"updated_at": current_time, "updated_at": current_time,
} }
if entity_delete_ids:
await self.entity_chunks.delete(list(entity_delete_ids))
if entity_upsert_payload: if entity_upsert_payload:
await self.entity_chunks.upsert(entity_upsert_payload) await self.entity_chunks.upsert(entity_upsert_payload)
if relation_chunk_updates and self.relation_chunks: if relation_chunk_updates and self.relation_chunks:
relation_upsert_payload = {} relation_upsert_payload = {}
relation_delete_ids: set[str] = set()
for edge_tuple, remaining in relation_chunk_updates.items(): for edge_tuple, remaining in relation_chunk_updates.items():
if not remaining:
# Empty relations are deleted alongside graph edges later
continue
storage_key = make_relation_chunk_key(*edge_tuple) storage_key = make_relation_chunk_key(*edge_tuple)
relation_upsert_payload[storage_key] = { if not remaining:
"chunk_ids": remaining, relation_delete_ids.add(storage_key)
"count": len(remaining), else:
"updated_at": current_time, relation_upsert_payload[storage_key] = {
} "chunk_ids": remaining,
"count": len(remaining),
"updated_at": current_time,
}
if relation_delete_ids:
await self.relation_chunks.delete(list(relation_delete_ids))
if relation_upsert_payload: if relation_upsert_payload:
await self.relation_chunks.upsert(relation_upsert_payload) await self.relation_chunks.upsert(relation_upsert_payload)
@ -3363,111 +3110,56 @@ class LightRAG:
logger.error(f"Failed to process graph analysis results: {e}") logger.error(f"Failed to process graph analysis results: {e}")
raise Exception(f"Failed to process graph dependencies: {e}") from e raise Exception(f"Failed to process graph dependencies: {e}") from e
# Data integrity is ensured by allowing only one process to hold pipeline at a timeno graph db lock is needed anymore) # Use graph database lock to prevent dirty read
graph_db_lock = get_graph_db_lock(enable_logging=False)
async with graph_db_lock:
# 5. Delete chunks from storage
if chunk_ids:
try:
await self.chunks_vdb.delete(chunk_ids)
await self.text_chunks.delete(chunk_ids)
# 5. Delete chunks from storage async with pipeline_status_lock:
if chunk_ids: log_message = f"Successfully deleted {len(chunk_ids)} chunks from storage"
try: logger.info(log_message)
await self.chunks_vdb.delete(chunk_ids) pipeline_status["latest_message"] = log_message
await self.text_chunks.delete(chunk_ids) pipeline_status["history_messages"].append(log_message)
async with pipeline_status_lock: except Exception as e:
log_message = ( logger.error(f"Failed to delete chunks: {e}")
f"Successfully deleted {len(chunk_ids)} chunks from storage" raise Exception(f"Failed to delete document chunks: {e}") from e
)
logger.info(log_message)
pipeline_status["latest_message"] = log_message
pipeline_status["history_messages"].append(log_message)
except Exception as e: # 6. Delete entities that have no remaining sources
logger.error(f"Failed to delete chunks: {e}") if entities_to_delete:
raise Exception(f"Failed to delete document chunks: {e}") from e try:
# Delete from vector database
# 6. Delete relationships that have no remaining sources entity_vdb_ids = [
if relationships_to_delete: compute_mdhash_id(entity, prefix="ent-")
try: for entity in entities_to_delete
# Delete from relation vdb
rel_ids_to_delete = []
for src, tgt in relationships_to_delete:
rel_ids_to_delete.extend(
[
compute_mdhash_id(src + tgt, prefix="rel-"),
compute_mdhash_id(tgt + src, prefix="rel-"),
]
)
await self.relationships_vdb.delete(rel_ids_to_delete)
# Delete from graph
await self.chunk_entity_relation_graph.remove_edges(
list(relationships_to_delete)
)
# Delete from relation_chunks storage
if self.relation_chunks:
relation_storage_keys = [
make_relation_chunk_key(src, tgt)
for src, tgt in relationships_to_delete
] ]
await self.relation_chunks.delete(relation_storage_keys) await self.entities_vdb.delete(entity_vdb_ids)
async with pipeline_status_lock: # Delete from graph
log_message = f"Successfully deleted {len(relationships_to_delete)} relations" await self.chunk_entity_relation_graph.remove_nodes(
logger.info(log_message)
pipeline_status["latest_message"] = log_message
pipeline_status["history_messages"].append(log_message)
except Exception as e:
logger.error(f"Failed to delete relationships: {e}")
raise Exception(f"Failed to delete relationships: {e}") from e
# 7. Delete entities that have no remaining sources
if entities_to_delete:
try:
# Batch get all edges for entities to avoid N+1 query problem
nodes_edges_dict = (
await self.chunk_entity_relation_graph.get_nodes_edges_batch(
list(entities_to_delete) list(entities_to_delete)
) )
)
# Debug: Check and log all edges before deleting nodes async with pipeline_status_lock:
edges_to_delete = set() log_message = f"Successfully deleted {len(entities_to_delete)} entities"
edges_still_exist = 0 logger.info(log_message)
pipeline_status["latest_message"] = log_message
pipeline_status["history_messages"].append(log_message)
for entity, edges in nodes_edges_dict.items(): except Exception as e:
if edges: logger.error(f"Failed to delete entities: {e}")
for src, tgt in edges: raise Exception(f"Failed to delete entities: {e}") from e
# Normalize edge representation (sorted for consistency)
edge_tuple = tuple(sorted((src, tgt)))
edges_to_delete.add(edge_tuple)
if ( # 7. Delete relationships that have no remaining sources
src in entities_to_delete if relationships_to_delete:
and tgt in entities_to_delete try:
): # Delete from vector database
logger.warning(
f"Edge still exists: {src} <-> {tgt}"
)
elif src in entities_to_delete:
logger.warning(
f"Edge still exists: {src} --> {tgt}"
)
else:
logger.warning(
f"Edge still exists: {src} <-- {tgt}"
)
edges_still_exist += 1
if edges_still_exist:
logger.warning(
f"⚠️ {edges_still_exist} entities still has edges before deletion"
)
# Clean residual edges from VDB and storage before deleting nodes
if edges_to_delete:
# Delete from relationships_vdb
rel_ids_to_delete = [] rel_ids_to_delete = []
for src, tgt in edges_to_delete: for src, tgt in relationships_to_delete:
rel_ids_to_delete.extend( rel_ids_to_delete.extend(
[ [
compute_mdhash_id(src + tgt, prefix="rel-"), compute_mdhash_id(src + tgt, prefix="rel-"),
@ -3476,53 +3168,28 @@ class LightRAG:
) )
await self.relationships_vdb.delete(rel_ids_to_delete) await self.relationships_vdb.delete(rel_ids_to_delete)
# Delete from relation_chunks storage # Delete from graph
if self.relation_chunks: await self.chunk_entity_relation_graph.remove_edges(
relation_storage_keys = [ list(relationships_to_delete)
make_relation_chunk_key(src, tgt)
for src, tgt in edges_to_delete
]
await self.relation_chunks.delete(relation_storage_keys)
logger.info(
f"Cleaned {len(edges_to_delete)} residual edges from VDB and chunk-tracking storage"
) )
# Delete from graph (edges will be auto-deleted with nodes) async with pipeline_status_lock:
await self.chunk_entity_relation_graph.remove_nodes( log_message = f"Successfully deleted {len(relationships_to_delete)} relations"
list(entities_to_delete) logger.info(log_message)
) pipeline_status["latest_message"] = log_message
pipeline_status["history_messages"].append(log_message)
# Delete from vector vdb except Exception as e:
entity_vdb_ids = [ logger.error(f"Failed to delete relationships: {e}")
compute_mdhash_id(entity, prefix="ent-") raise Exception(f"Failed to delete relationships: {e}") from e
for entity in entities_to_delete
]
await self.entities_vdb.delete(entity_vdb_ids)
# Delete from entity_chunks storage # Persist changes to graph database before releasing graph database lock
if self.entity_chunks: await self._insert_done()
await self.entity_chunks.delete(list(entities_to_delete))
async with pipeline_status_lock:
log_message = (
f"Successfully deleted {len(entities_to_delete)} entities"
)
logger.info(log_message)
pipeline_status["latest_message"] = log_message
pipeline_status["history_messages"].append(log_message)
except Exception as e:
logger.error(f"Failed to delete entities: {e}")
raise Exception(f"Failed to delete entities: {e}") from e
# Persist changes to graph database before entity and relationship rebuild
await self._insert_done()
# 8. Rebuild entities and relationships from remaining chunks # 8. Rebuild entities and relationships from remaining chunks
if entities_to_rebuild or relationships_to_rebuild: if entities_to_rebuild or relationships_to_rebuild:
try: try:
await rebuild_knowledge_from_chunks( await _rebuild_knowledge_from_chunks(
entities_to_rebuild=entities_to_rebuild, entities_to_rebuild=entities_to_rebuild,
relationships_to_rebuild=relationships_to_rebuild, relationships_to_rebuild=relationships_to_rebuild,
knowledge_graph_inst=self.chunk_entity_relation_graph, knowledge_graph_inst=self.chunk_entity_relation_graph,
@ -3559,23 +3226,6 @@ class LightRAG:
logger.error(f"Failed to delete document and status: {e}") logger.error(f"Failed to delete document and status: {e}")
raise Exception(f"Failed to delete document and status: {e}") from e raise Exception(f"Failed to delete document and status: {e}") from e
if delete_llm_cache and doc_llm_cache_ids and self.llm_response_cache:
try:
await self.llm_response_cache.delete(doc_llm_cache_ids)
cache_log_message = f"Successfully deleted {len(doc_llm_cache_ids)} LLM cache entries for document {doc_id}"
logger.info(cache_log_message)
async with pipeline_status_lock:
pipeline_status["latest_message"] = cache_log_message
pipeline_status["history_messages"].append(cache_log_message)
log_message = cache_log_message
except Exception as cache_delete_error:
log_message = f"Failed to delete LLM cache for document {doc_id}: {cache_delete_error}"
logger.error(log_message)
logger.error(traceback.format_exc())
async with pipeline_status_lock:
pipeline_status["latest_message"] = log_message
pipeline_status["history_messages"].append(log_message)
return DeletionResult( return DeletionResult(
status="success", status="success",
doc_id=doc_id, doc_id=doc_id,
@ -3623,18 +3273,6 @@ class LightRAG:
f"No deletion operations were started for document {doc_id}, skipping persistence" f"No deletion operations were started for document {doc_id}, skipping persistence"
) )
# Release pipeline only if WE acquired it
if we_acquired_pipeline:
async with pipeline_status_lock:
pipeline_status["busy"] = False
pipeline_status["cancellation_requested"] = False
completion_msg = (
f"Deletion process completed for document: {doc_id}"
)
pipeline_status["latest_message"] = completion_msg
pipeline_status["history_messages"].append(completion_msg)
logger.info(completion_msg)
async def adelete_by_entity(self, entity_name: str) -> DeletionResult: async def adelete_by_entity(self, entity_name: str) -> DeletionResult:
"""Asynchronously delete an entity and all its relationships. """Asynchronously delete an entity and all its relationships.
@ -3752,22 +3390,16 @@ class LightRAG:
) )
async def aedit_entity( async def aedit_entity(
self, self, entity_name: str, updated_data: dict[str, str], allow_rename: bool = True
entity_name: str,
updated_data: dict[str, str],
allow_rename: bool = True,
allow_merge: bool = False,
) -> dict[str, Any]: ) -> dict[str, Any]:
"""Asynchronously edit entity information. """Asynchronously edit entity information.
Updates entity information in the knowledge graph and re-embeds the entity in the vector database. Updates entity information in the knowledge graph and re-embeds the entity in the vector database.
Also synchronizes entity_chunks_storage and relation_chunks_storage to track chunk references.
Args: Args:
entity_name: Name of the entity to edit entity_name: Name of the entity to edit
updated_data: Dictionary containing updated attributes, e.g. {"description": "new description", "entity_type": "new type"} updated_data: Dictionary containing updated attributes, e.g. {"description": "new description", "entity_type": "new type"}
allow_rename: Whether to allow entity renaming, defaults to True allow_rename: Whether to allow entity renaming, defaults to True
allow_merge: Whether to merge into an existing entity when renaming to an existing name
Returns: Returns:
Dictionary containing updated entity information Dictionary containing updated entity information
@ -3781,21 +3413,14 @@ class LightRAG:
entity_name, entity_name,
updated_data, updated_data,
allow_rename, allow_rename,
allow_merge,
self.entity_chunks,
self.relation_chunks,
) )
def edit_entity( def edit_entity(
self, self, entity_name: str, updated_data: dict[str, str], allow_rename: bool = True
entity_name: str,
updated_data: dict[str, str],
allow_rename: bool = True,
allow_merge: bool = False,
) -> dict[str, Any]: ) -> dict[str, Any]:
loop = always_get_an_event_loop() loop = always_get_an_event_loop()
return loop.run_until_complete( return loop.run_until_complete(
self.aedit_entity(entity_name, updated_data, allow_rename, allow_merge) self.aedit_entity(entity_name, updated_data, allow_rename)
) )
async def aedit_relation( async def aedit_relation(
@ -3804,7 +3429,6 @@ class LightRAG:
"""Asynchronously edit relation information. """Asynchronously edit relation information.
Updates relation (edge) information in the knowledge graph and re-embeds the relation in the vector database. Updates relation (edge) information in the knowledge graph and re-embeds the relation in the vector database.
Also synchronizes the relation_chunks_storage to track which chunks reference this relation.
Args: Args:
source_entity: Name of the source entity source_entity: Name of the source entity
@ -3823,7 +3447,6 @@ class LightRAG:
source_entity, source_entity,
target_entity, target_entity,
updated_data, updated_data,
self.relation_chunks,
) )
def edit_relation( def edit_relation(
@ -3935,8 +3558,6 @@ class LightRAG:
target_entity, target_entity,
merge_strategy, merge_strategy,
target_entity_data, target_entity_data,
self.entity_chunks,
self.relation_chunks,
) )
def merge_entities( def merge_entities(

View file

@ -10,8 +10,8 @@ class NameSpace:
KV_STORE_LLM_RESPONSE_CACHE = "llm_response_cache" KV_STORE_LLM_RESPONSE_CACHE = "llm_response_cache"
KV_STORE_FULL_ENTITIES = "full_entities" KV_STORE_FULL_ENTITIES = "full_entities"
KV_STORE_FULL_RELATIONS = "full_relations" KV_STORE_FULL_RELATIONS = "full_relations"
KV_STORE_TENANTS = "tenants" KV_STORE_ENTITY_CHUNKS = "entity_chunks"
KV_STORE_KNOWLEDGE_BASES = "knowledge_bases" KV_STORE_RELATION_CHUNKS = "relation_chunks"
VECTOR_STORE_ENTITIES = "entities" VECTOR_STORE_ENTITIES = "entities"
VECTOR_STORE_RELATIONSHIPS = "relationships" VECTOR_STORE_RELATIONSHIPS = "relationships"

File diff suppressed because it is too large Load diff

View file

@ -1,8 +1,6 @@
from __future__ import annotations from __future__ import annotations
import weakref import weakref
import sys
import asyncio import asyncio
import html import html
import csv import csv
@ -37,40 +35,12 @@ from lightrag.constants import (
DEFAULT_LOG_FILENAME, DEFAULT_LOG_FILENAME,
GRAPH_FIELD_SEP, GRAPH_FIELD_SEP,
DEFAULT_MAX_TOTAL_TOKENS, DEFAULT_MAX_TOTAL_TOKENS,
DEFAULT_MAX_FILE_PATH_LENGTH,
DEFAULT_SOURCE_IDS_LIMIT_METHOD, DEFAULT_SOURCE_IDS_LIMIT_METHOD,
VALID_SOURCE_IDS_LIMIT_METHODS, VALID_SOURCE_IDS_LIMIT_METHODS,
SOURCE_IDS_LIMIT_METHOD_FIFO, SOURCE_IDS_LIMIT_METHOD_FIFO,
) )
# Precompile regex pattern for JSON sanitization (module-level, compiled once)
_SURROGATE_PATTERN = re.compile(r"[\uD800-\uDFFF\uFFFE\uFFFF]")
class SafeStreamHandler(logging.StreamHandler):
"""StreamHandler that gracefully handles closed streams during shutdown.
This handler prevents "ValueError: I/O operation on closed file" errors
that can occur when pytest or other test frameworks close stdout/stderr
before Python's logging cleanup runs.
"""
def flush(self):
"""Flush the stream, ignoring errors if the stream is closed."""
try:
super().flush()
except (ValueError, OSError):
# Stream is closed or otherwise unavailable, silently ignore
pass
def close(self):
"""Close the handler, ignoring errors if the stream is already closed."""
try:
super().close()
except (ValueError, OSError):
# Stream is closed or otherwise unavailable, silently ignore
pass
# Initialize logger with basic configuration # Initialize logger with basic configuration
logger = logging.getLogger("lightrag") logger = logging.getLogger("lightrag")
logger.propagate = False # prevent log message send to root logger logger.propagate = False # prevent log message send to root logger
@ -78,7 +48,7 @@ logger.setLevel(logging.INFO)
# Add console handler if no handlers exist # Add console handler if no handlers exist
if not logger.handlers: if not logger.handlers:
console_handler = SafeStreamHandler() console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO) console_handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(levelname)s: %(message)s") formatter = logging.Formatter("%(levelname)s: %(message)s")
console_handler.setFormatter(formatter) console_handler.setFormatter(formatter)
@ -87,33 +57,6 @@ if not logger.handlers:
# Set httpx logging level to WARNING # Set httpx logging level to WARNING
logging.getLogger("httpx").setLevel(logging.WARNING) logging.getLogger("httpx").setLevel(logging.WARNING)
def _patch_ascii_colors_console_handler() -> None:
"""Prevent ascii_colors from printing flush errors during interpreter exit."""
try:
from ascii_colors import ConsoleHandler
except ImportError:
return
if getattr(ConsoleHandler, "_lightrag_patched", False):
return
original_handle_error = ConsoleHandler.handle_error
def _safe_handle_error(self, message: str) -> None: # type: ignore[override]
exc_type, _, _ = sys.exc_info()
if exc_type in (ValueError, OSError) and "close" in message.lower():
return
original_handle_error(self, message)
ConsoleHandler.handle_error = _safe_handle_error # type: ignore[assignment]
ConsoleHandler._lightrag_patched = True # type: ignore[attr-defined]
_patch_ascii_colors_console_handler()
# Global import for pypinyin with startup-time logging # Global import for pypinyin with startup-time logging
try: try:
import pypinyin import pypinyin
@ -341,8 +284,8 @@ def setup_logger(
logger_instance.handlers = [] # Clear existing handlers logger_instance.handlers = [] # Clear existing handlers
logger_instance.propagate = False logger_instance.propagate = False
# Add console handler with safe stream handling # Add console handler
console_handler = SafeStreamHandler() console_handler = logging.StreamHandler()
console_handler.setFormatter(simple_formatter) console_handler.setFormatter(simple_formatter)
console_handler.setLevel(level) console_handler.setLevel(level)
logger_instance.addHandler(console_handler) logger_instance.addHandler(console_handler)
@ -408,69 +351,12 @@ class TaskState:
@dataclass @dataclass
class EmbeddingFunc: class EmbeddingFunc:
"""Embedding function wrapper with dimension validation
This class wraps an embedding function to ensure that the output embeddings have the correct dimension.
This class should not be wrapped multiple times.
Args:
embedding_dim: Expected dimension of the embeddings
func: The actual embedding function to wrap
max_token_size: Optional token limit for the embedding model
send_dimensions: Whether to inject embedding_dim as a keyword argument
"""
embedding_dim: int embedding_dim: int
func: callable func: callable
max_token_size: int | None = None # Token limit for the embedding model max_token_size: int | None = None # deprecated keep it for compatible only
send_dimensions: bool = (
False # Control whether to send embedding_dim to the function
)
async def __call__(self, *args, **kwargs) -> np.ndarray: async def __call__(self, *args, **kwargs) -> np.ndarray:
# Only inject embedding_dim when send_dimensions is True return await self.func(*args, **kwargs)
if self.send_dimensions:
# Check if user provided embedding_dim parameter
if "embedding_dim" in kwargs:
user_provided_dim = kwargs["embedding_dim"]
# If user's value differs from class attribute, output warning
if (
user_provided_dim is not None
and user_provided_dim != self.embedding_dim
):
logger.warning(
f"Ignoring user-provided embedding_dim={user_provided_dim}, "
f"using declared embedding_dim={self.embedding_dim} from decorator"
)
# Inject embedding_dim from decorator
kwargs["embedding_dim"] = self.embedding_dim
# Call the actual embedding function
result = await self.func(*args, **kwargs)
# Validate embedding dimensions using total element count
total_elements = result.size # Total number of elements in the numpy array
expected_dim = self.embedding_dim
# Check if total elements can be evenly divided by embedding_dim
if total_elements % expected_dim != 0:
raise ValueError(
f"Embedding dimension mismatch detected: "
f"total elements ({total_elements}) cannot be evenly divided by "
f"expected dimension ({expected_dim}). "
)
# Optional: Verify vector count matches input text count
actual_vectors = total_elements // expected_dim
if args and isinstance(args[0], (list, tuple)):
expected_vectors = len(args[0])
if actual_vectors != expected_vectors:
raise ValueError(
f"Vector count mismatch: "
f"expected {expected_vectors} vectors but got {actual_vectors} vectors (from embedding result)."
)
return result
def compute_args_hash(*args: Any) -> str: def compute_args_hash(*args: Any) -> str:
@ -1021,123 +907,9 @@ def load_json(file_name):
return json.load(f) return json.load(f)
def _sanitize_string_for_json(text: str) -> str:
"""Remove characters that cannot be encoded in UTF-8 for JSON serialization.
Uses regex for optimal performance with zero-copy optimization for clean strings.
Fast detection path for clean strings (99% of cases) with efficient removal for dirty strings.
Args:
text: String to sanitize
Returns:
Original string if clean (zero-copy), sanitized string if dirty
"""
if not text:
return text
# Fast path: Check if sanitization is needed using C-level regex search
if not _SURROGATE_PATTERN.search(text):
return text # Zero-copy for clean strings - most common case
# Slow path: Remove problematic characters using C-level regex substitution
return _SURROGATE_PATTERN.sub("", text)
class SanitizingJSONEncoder(json.JSONEncoder):
"""
Custom JSON encoder that sanitizes data during serialization.
This encoder cleans strings during the encoding process without creating
a full copy of the data structure, making it memory-efficient for large datasets.
"""
def encode(self, o):
"""Override encode method to handle simple string cases"""
if isinstance(o, str):
return json.encoder.encode_basestring(_sanitize_string_for_json(o))
return super().encode(o)
def iterencode(self, o, _one_shot=False):
"""
Override iterencode to sanitize strings during serialization.
This is the core method that handles complex nested structures.
"""
# Preprocess: sanitize all strings in the object
sanitized = self._sanitize_for_encoding(o)
# Call parent's iterencode with sanitized data
for chunk in super().iterencode(sanitized, _one_shot):
yield chunk
def _sanitize_for_encoding(self, obj):
"""
Recursively sanitize strings in an object.
Creates new objects only when necessary to avoid deep copies.
Args:
obj: Object to sanitize
Returns:
Sanitized object with cleaned strings
"""
if isinstance(obj, str):
return _sanitize_string_for_json(obj)
elif isinstance(obj, dict):
# Create new dict with sanitized keys and values
new_dict = {}
for k, v in obj.items():
clean_k = _sanitize_string_for_json(k) if isinstance(k, str) else k
clean_v = self._sanitize_for_encoding(v)
new_dict[clean_k] = clean_v
return new_dict
elif isinstance(obj, (list, tuple)):
# Sanitize list/tuple elements
cleaned = [self._sanitize_for_encoding(item) for item in obj]
return type(obj)(cleaned) if isinstance(obj, tuple) else cleaned
else:
# Numbers, booleans, None, etc. remain unchanged
return obj
def write_json(json_obj, file_name): def write_json(json_obj, file_name):
"""
Write JSON data to file with optimized sanitization strategy.
This function uses a two-stage approach:
1. Fast path: Try direct serialization (works for clean data ~99% of time)
2. Slow path: Use custom encoder that sanitizes during serialization
The custom encoder approach avoids creating a deep copy of the data,
making it memory-efficient. When sanitization occurs, the caller should
reload the cleaned data from the file to update shared memory.
Args:
json_obj: Object to serialize (may be a shallow copy from shared memory)
file_name: Output file path
Returns:
bool: True if sanitization was applied (caller should reload data),
False if direct write succeeded (no reload needed)
"""
try:
# Strategy 1: Fast path - try direct serialization
with open(file_name, "w", encoding="utf-8") as f:
json.dump(json_obj, f, indent=2, ensure_ascii=False)
return False # No sanitization needed, no reload required
except (UnicodeEncodeError, UnicodeDecodeError) as e:
logger.debug(f"Direct JSON write failed, using sanitizing encoder: {e}")
# Strategy 2: Use custom encoder (sanitizes during serialization, zero memory copy)
with open(file_name, "w", encoding="utf-8") as f: with open(file_name, "w", encoding="utf-8") as f:
json.dump(json_obj, f, indent=2, ensure_ascii=False, cls=SanitizingJSONEncoder) json.dump(json_obj, f, indent=2, ensure_ascii=False)
logger.info(f"JSON sanitization applied during write: {file_name}")
return True # Sanitization applied, reload recommended
class TokenizerInterface(Protocol): class TokenizerInterface(Protocol):
@ -2024,7 +1796,7 @@ def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
- Filter out short numeric-only text (length < 3 and only digits/dots) - Filter out short numeric-only text (length < 3 and only digits/dots)
- remove_inner_quotes = True - remove_inner_quotes = True
remove Chinese quotes remove Chinese quotes
remove English quotes in and around chinese remove English queotes in and around chinese
Convert non-breaking spaces to regular spaces Convert non-breaking spaces to regular spaces
Convert narrow non-breaking spaces after non-digits to regular spaces Convert narrow non-breaking spaces after non-digits to regular spaces
@ -2780,52 +2552,6 @@ def apply_source_ids_limit(
return truncated return truncated
def compute_incremental_chunk_ids(
existing_full_chunk_ids: list[str],
old_chunk_ids: list[str],
new_chunk_ids: list[str],
) -> list[str]:
"""
Compute incrementally updated chunk IDs based on changes.
This function applies delta changes (additions and removals) to an existing
list of chunk IDs while maintaining order and ensuring deduplication.
Delta additions from new_chunk_ids are placed at the end.
Args:
existing_full_chunk_ids: Complete list of existing chunk IDs from storage
old_chunk_ids: Previous chunk IDs from source_id (chunks being replaced)
new_chunk_ids: New chunk IDs from updated source_id (chunks being added)
Returns:
Updated list of chunk IDs with deduplication
Example:
>>> existing = ['chunk-1', 'chunk-2', 'chunk-3']
>>> old = ['chunk-1', 'chunk-2']
>>> new = ['chunk-2', 'chunk-4']
>>> compute_incremental_chunk_ids(existing, old, new)
['chunk-3', 'chunk-2', 'chunk-4']
"""
# Calculate changes
chunks_to_remove = set(old_chunk_ids) - set(new_chunk_ids)
chunks_to_add = set(new_chunk_ids) - set(old_chunk_ids)
# Apply changes to full chunk_ids
# Step 1: Remove chunks that are no longer needed
updated_chunk_ids = [
cid for cid in existing_full_chunk_ids if cid not in chunks_to_remove
]
# Step 2: Add new chunks (preserving order from new_chunk_ids)
# Note: 'cid not in updated_chunk_ids' check ensures deduplication
for cid in new_chunk_ids:
if cid in chunks_to_add and cid not in updated_chunk_ids:
updated_chunk_ids.append(cid)
return updated_chunk_ids
def subtract_source_ids( def subtract_source_ids(
source_ids: Iterable[str], source_ids: Iterable[str],
ids_to_remove: Collection[str], ids_to_remove: Collection[str],
@ -2858,6 +2584,65 @@ def parse_relation_chunk_key(key: str) -> tuple[str, str]:
return parts[0], parts[1] return parts[0], parts[1]
def build_file_path(already_file_paths, data_list, target):
"""Build file path string with UTF-8 byte length limit and deduplication
Args:
already_file_paths: List of existing file paths
data_list: List of data items containing file_path
target: Target name for logging warnings
Returns:
str: Combined file paths separated by GRAPH_FIELD_SEP
"""
# set: deduplication
file_paths_set = {fp for fp in already_file_paths if fp}
# string: filter empty value and keep file order in already_file_paths
file_paths = GRAPH_FIELD_SEP.join(fp for fp in already_file_paths if fp)
# Check if initial file_paths already exceeds byte length limit
if len(file_paths.encode("utf-8")) >= DEFAULT_MAX_FILE_PATH_LENGTH:
logger.warning(
f"Initial file_paths already exceeds {DEFAULT_MAX_FILE_PATH_LENGTH} bytes for {target}, "
f"current size: {len(file_paths.encode('utf-8'))} bytes"
)
# ignored file_paths
file_paths_ignore = ""
# add file_paths
for dp in data_list:
cur_file_path = dp.get("file_path")
# empty
if not cur_file_path:
continue
# skip duplicate item
if cur_file_path in file_paths_set:
continue
# add
file_paths_set.add(cur_file_path)
# check the UTF-8 byte length
new_addition = GRAPH_FIELD_SEP + cur_file_path if file_paths else cur_file_path
if (
len(file_paths.encode("utf-8")) + len(new_addition.encode("utf-8"))
< DEFAULT_MAX_FILE_PATH_LENGTH - 5
):
# append
file_paths += new_addition
else:
# ignore
file_paths_ignore += GRAPH_FIELD_SEP + cur_file_path
if file_paths_ignore:
logger.warning(
f"File paths exceed {DEFAULT_MAX_FILE_PATH_LENGTH} bytes for {target}, "
f"ignoring file path: {file_paths_ignore}"
)
return file_paths
def generate_track_id(prefix: str = "upload") -> str: def generate_track_id(prefix: str = "upload") -> str:
"""Generate a unique tracking ID with timestamp and UUID """Generate a unique tracking ID with timestamp and UUID