From 896e20357437a3c06b9ba36f2543c49fa4639d3b Mon Sep 17 00:00:00 2001 From: captainmirk <77876380+captainmirk@users.noreply.github.com> Date: Wed, 3 Dec 2025 00:20:40 +0000 Subject: [PATCH 1/3] feat(core): implement safety check for embedding dimension mismatch --- lightrag/lightrag.py | 56 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 8a638759..b9b50ec9 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -5,6 +5,7 @@ import asyncio import configparser import inspect import os +import json import time import warnings from dataclasses import asdict, dataclass, field @@ -524,8 +525,8 @@ class LightRAG: _print_config = ",\n ".join([f"{k} = {v}" for k, v in global_config.items()]) logger.debug(f"LightRAG init with param:\n {_print_config}\n") - # Init Embedding - # Step 1: Capture max_token_size before applying decorator (decorator strips dataclass attributes) + # Init Embedding + # Step 1: Capture max_token_size and embedding_dim before applying decorator embedding_max_token_size = None if self.embedding_func and hasattr(self.embedding_func, "max_token_size"): embedding_max_token_size = self.embedding_func.max_token_size @@ -534,6 +535,12 @@ class LightRAG: ) self.embedding_token_limit = embedding_max_token_size + # --- CAPTURE EMBEDDING DIMENSION (NEW) --- + self.embedding_dim = None + if self.embedding_func and hasattr(self.embedding_func, "embedding_dim"): + self.embedding_dim = self.embedding_func.embedding_dim + # ----------------------------------------- + # Step 2: Apply priority wrapper decorator self.embedding_func = priority_limit_async_func_call( self.embedding_func_max_async, @@ -658,8 +665,53 @@ class LightRAG: self._storages_status = StoragesStatus.CREATED + def _check_embedding_config(self): + """ + Validates that the current embedding dimension matches the existing data. + Prevents data corruption when switching models without clearing storage. + """ + if self.embedding_dim is None: + # If we couldn't capture dimensions, skip the check to avoid blocking valid custom models + return + + meta_file = os.path.join(self.working_dir, "lightrag_meta.json") + + if os.path.exists(meta_file): + with open(meta_file, "r", encoding="utf-8") as f: + try: + meta_data = json.load(f) + saved_dim = meta_data.get("embedding_dim") + saved_model = meta_data.get("embedding_model_func", "unknown") + + if saved_dim and saved_dim != self.embedding_dim: + raise ValueError( + f"Embedding dimension mismatch! " + f"Existing data uses dimension {saved_dim} (Model: {saved_model}), " + f"but current configuration uses {self.embedding_dim}. " + f"Please clear the '{self.working_dir}' directory or switch back to the original model." + ) + except json.JSONDecodeError: + logger.warning(f"Could not parse {meta_file}. Skipping dimension check.") + else: + # First run: Save the metadata + meta_data = { + "embedding_dim": self.embedding_dim, + "embedding_model_func": self.embedding_func.__class__.__name__ if self.embedding_func else "unknown", + "created_at": str(os.path.abspath(self.working_dir)) + } + # Ensure directory exists + if not os.path.exists(self.working_dir): + os.makedirs(self.working_dir) + + with open(meta_file, "w", encoding="utf-8") as f: + json.dump(meta_data, f, indent=4) + async def initialize_storages(self): """Storage initialization must be called one by one to prevent deadlock""" + # --- NEW SAFETY CHECK CALL --- + self._check_embedding_config() + # ----------------------------- + if self._storages_status == StoragesStatus.CREATED: # Set the first initialized workspace will set the default workspace # Allows namespace operation without specifying workspace for backward compatibility From c806694379deb2bfc00687aff2588062e895928d Mon Sep 17 00:00:00 2001 From: captainmirk <77876380+captainmirk@users.noreply.github.com> Date: Wed, 3 Dec 2025 11:36:06 +0000 Subject: [PATCH 2/3] style: fix linting and trailing whitespace --- lightrag/lightrag.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index b9b50ec9..d8be53d8 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -525,7 +525,7 @@ class LightRAG: _print_config = ",\n ".join([f"{k} = {v}" for k, v in global_config.items()]) logger.debug(f"LightRAG init with param:\n {_print_config}\n") - # Init Embedding + # Init Embedding # Step 1: Capture max_token_size and embedding_dim before applying decorator embedding_max_token_size = None if self.embedding_func and hasattr(self.embedding_func, "max_token_size"): @@ -675,14 +675,14 @@ class LightRAG: return meta_file = os.path.join(self.working_dir, "lightrag_meta.json") - + if os.path.exists(meta_file): with open(meta_file, "r", encoding="utf-8") as f: try: meta_data = json.load(f) saved_dim = meta_data.get("embedding_dim") saved_model = meta_data.get("embedding_model_func", "unknown") - + if saved_dim and saved_dim != self.embedding_dim: raise ValueError( f"Embedding dimension mismatch! " @@ -691,18 +691,22 @@ class LightRAG: f"Please clear the '{self.working_dir}' directory or switch back to the original model." ) except json.JSONDecodeError: - logger.warning(f"Could not parse {meta_file}. Skipping dimension check.") + logger.warning( + f"Could not parse {meta_file}. Skipping dimension check." + ) else: # First run: Save the metadata meta_data = { "embedding_dim": self.embedding_dim, - "embedding_model_func": self.embedding_func.__class__.__name__ if self.embedding_func else "unknown", - "created_at": str(os.path.abspath(self.working_dir)) + "embedding_model_func": self.embedding_func.__class__.__name__ + if self.embedding_func + else "unknown", + "created_at": str(os.path.abspath(self.working_dir)), } # Ensure directory exists if not os.path.exists(self.working_dir): os.makedirs(self.working_dir) - + with open(meta_file, "w", encoding="utf-8") as f: json.dump(meta_data, f, indent=4) From 14b648e10f682150adb4c30b92165a046459b07c Mon Sep 17 00:00:00 2001 From: captainmirk <77876380+captainmirk@users.noreply.github.com> Date: Thu, 11 Dec 2025 15:58:11 +0000 Subject: [PATCH 3/3] fix: remove debug instrumentation and finalize logic fixes --- lightrag/lightrag.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index d8be53d8..b6bd7f4a 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -118,6 +118,7 @@ from dotenv import load_dotenv # use the .env that is inside the current folder # allows to use different .env file for each lightrag instance + # the OS environment variables take precedence over the .env file load_dotenv(dotenv_path=".env", override=False) @@ -535,6 +536,11 @@ class LightRAG: ) self.embedding_token_limit = embedding_max_token_size + # Capture embedding model name before decoration so we don't lose it to wrappers + self.embedding_model_name = ( + self.embedding_func.__class__.__name__ if self.embedding_func else "unknown" + ) + # --- CAPTURE EMBEDDING DIMENSION (NEW) --- self.embedding_dim = None if self.embedding_func and hasattr(self.embedding_func, "embedding_dim"): @@ -698,11 +704,10 @@ class LightRAG: # First run: Save the metadata meta_data = { "embedding_dim": self.embedding_dim, - "embedding_model_func": self.embedding_func.__class__.__name__ - if self.embedding_func - else "unknown", - "created_at": str(os.path.abspath(self.working_dir)), + "embedding_model_func": self.embedding_model_name, + "created_at": datetime.now(timezone.utc).isoformat(), } + # Ensure directory exists if not os.path.exists(self.working_dir): os.makedirs(self.working_dir)