From 05852e1ab20007e746800961191de5f030cd46ad Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 14 Nov 2025 18:41:43 +0800 Subject: [PATCH 1/8] Add max_token_size parameter to embedding function decorators - Add max_token_size=8192 to all embed funcs - Move siliconcloud to deprecated folder - Import wrap_embedding_func_with_attrs - Update EmbeddingFunc docstring - Fix langfuse import type annotation --- lightrag/llm/bedrock.py | 3 ++- lightrag/llm/{ => deprecated}/siliconcloud.py | 0 lightrag/llm/gemini.py | 2 +- lightrag/llm/hf.py | 2 ++ lightrag/llm/jina.py | 2 +- lightrag/llm/llama_index_impl.py | 2 +- lightrag/llm/lollms.py | 5 +++++ lightrag/llm/nvidia_openai.py | 2 +- lightrag/llm/ollama.py | 6 +++++- lightrag/llm/openai.py | 4 ++-- lightrag/utils.py | 2 +- 11 files changed, 21 insertions(+), 9 deletions(-) rename lightrag/llm/{ => deprecated}/siliconcloud.py (100%) diff --git a/lightrag/llm/bedrock.py b/lightrag/llm/bedrock.py index 16737341..ccfbb4f7 100644 --- a/lightrag/llm/bedrock.py +++ b/lightrag/llm/bedrock.py @@ -16,6 +16,7 @@ from tenacity import ( ) import sys +from lightrag.utils import wrap_embedding_func_with_attrs if sys.version_info < (3, 9): from typing import AsyncIterator @@ -253,7 +254,7 @@ async def bedrock_complete( return result -# @wrap_embedding_func_with_attrs(embedding_dim=1024) +@wrap_embedding_func_with_attrs(embedding_dim=1024, max_token_size=8192) # @retry( # stop=stop_after_attempt(3), # wait=wait_exponential(multiplier=1, min=4, max=10), diff --git a/lightrag/llm/siliconcloud.py b/lightrag/llm/deprecated/siliconcloud.py similarity index 100% rename from lightrag/llm/siliconcloud.py rename to lightrag/llm/deprecated/siliconcloud.py diff --git a/lightrag/llm/gemini.py b/lightrag/llm/gemini.py index 983d6b9f..37ce7206 100644 --- a/lightrag/llm/gemini.py +++ b/lightrag/llm/gemini.py @@ -453,7 +453,7 @@ async def gemini_model_complete( ) -@wrap_embedding_func_with_attrs(embedding_dim=1536) +@wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=2048) @retry( stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=60), diff --git a/lightrag/llm/hf.py b/lightrag/llm/hf.py index c33b1c7f..447f95c3 100644 --- a/lightrag/llm/hf.py +++ b/lightrag/llm/hf.py @@ -26,6 +26,7 @@ from lightrag.exceptions import ( ) import torch import numpy as np +from lightrag.utils import wrap_embedding_func_with_attrs os.environ["TOKENIZERS_PARALLELISM"] = "false" @@ -141,6 +142,7 @@ async def hf_model_complete( return result +@wrap_embedding_func_with_attrs(embedding_dim=1024, max_token_size=8192) async def hf_embed(texts: list[str], tokenizer, embed_model) -> np.ndarray: # Detect the appropriate device if torch.cuda.is_available(): diff --git a/lightrag/llm/jina.py b/lightrag/llm/jina.py index 70de5995..f61faadd 100644 --- a/lightrag/llm/jina.py +++ b/lightrag/llm/jina.py @@ -58,7 +58,7 @@ async def fetch_data(url, headers, data): return data_list -@wrap_embedding_func_with_attrs(embedding_dim=2048) +@wrap_embedding_func_with_attrs(embedding_dim=2048, max_token_size=8192) @retry( stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=60), diff --git a/lightrag/llm/llama_index_impl.py b/lightrag/llm/llama_index_impl.py index 38ec7cd1..c44e6c7a 100644 --- a/lightrag/llm/llama_index_impl.py +++ b/lightrag/llm/llama_index_impl.py @@ -174,7 +174,7 @@ async def llama_index_complete( return result -@wrap_embedding_func_with_attrs(embedding_dim=1536) +@wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8192) @retry( 
stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=60), diff --git a/lightrag/llm/lollms.py b/lightrag/llm/lollms.py index 9274dbfc..2f2a1dbf 100644 --- a/lightrag/llm/lollms.py +++ b/lightrag/llm/lollms.py @@ -26,6 +26,10 @@ from lightrag.exceptions import ( from typing import Union, List import numpy as np +from lightrag.utils import ( + wrap_embedding_func_with_attrs, +) + @retry( stop=stop_after_attempt(3), @@ -134,6 +138,7 @@ async def lollms_model_complete( ) +@wrap_embedding_func_with_attrs(embedding_dim=1024, max_token_size=8192) async def lollms_embed( texts: List[str], embed_model=None, base_url="http://localhost:9600", **kwargs ) -> np.ndarray: diff --git a/lightrag/llm/nvidia_openai.py b/lightrag/llm/nvidia_openai.py index 1cbab380..1ebaf3a6 100644 --- a/lightrag/llm/nvidia_openai.py +++ b/lightrag/llm/nvidia_openai.py @@ -33,7 +33,7 @@ from lightrag.utils import ( import numpy as np -@wrap_embedding_func_with_attrs(embedding_dim=2048) +@wrap_embedding_func_with_attrs(embedding_dim=2048, max_token_size=8192) @retry( stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=60), diff --git a/lightrag/llm/ollama.py b/lightrag/llm/ollama.py index 670351bc..e35dc293 100644 --- a/lightrag/llm/ollama.py +++ b/lightrag/llm/ollama.py @@ -25,7 +25,10 @@ from lightrag.api import __api_version__ import numpy as np from typing import Optional, Union -from lightrag.utils import logger +from lightrag.utils import ( + wrap_embedding_func_with_attrs, + logger, +) _OLLAMA_CLOUD_HOST = "https://ollama.com" @@ -169,6 +172,7 @@ async def ollama_model_complete( ) +@wrap_embedding_func_with_attrs(embedding_dim=1024, max_token_size=8192) async def ollama_embed(texts: list[str], embed_model, **kwargs) -> np.ndarray: api_key = kwargs.pop("api_key", None) if not api_key: diff --git a/lightrag/llm/openai.py b/lightrag/llm/openai.py index a2bbfa23..8c984e51 100644 --- a/lightrag/llm/openai.py +++ b/lightrag/llm/openai.py @@ -47,7 +47,7 @@ try: # Only enable Langfuse if both keys are configured if langfuse_public_key and langfuse_secret_key: - from langfuse.openai import AsyncOpenAI + from langfuse.openai import AsyncOpenAI # type: ignore[import-untyped] LANGFUSE_ENABLED = True logger.info("Langfuse observability enabled for OpenAI client") @@ -604,7 +604,7 @@ async def nvidia_openai_complete( return result -@wrap_embedding_func_with_attrs(embedding_dim=1536) +@wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8192) @retry( stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=60), diff --git a/lightrag/utils.py b/lightrag/utils.py index b78b7523..d653c1e3 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -355,7 +355,7 @@ class TaskState: class EmbeddingFunc: embedding_dim: int func: callable - max_token_size: int | None = None # deprecated keep it for compatible only + max_token_size: int | None = None # Token limit for the embedding model send_dimensions: bool = ( False # Control whether to send embedding_dim to the function ) From 680e36c6ebe6a2302914956845541f77c3ea855a Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 14 Nov 2025 18:51:41 +0800 Subject: [PATCH 2/8] Improve Bedrock error handling with retry logic and custom exceptions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Add specific exception types • Implement proper retry mechanism • Better error classification • Enhanced logging and validation • Enable embedding retry decorator --- lightrag/llm/bedrock.py | 256 
++++++++++++++++++++++++++++++++-------- 1 file changed, 205 insertions(+), 51 deletions(-) diff --git a/lightrag/llm/bedrock.py b/lightrag/llm/bedrock.py index ccfbb4f7..f6871422 100644 --- a/lightrag/llm/bedrock.py +++ b/lightrag/llm/bedrock.py @@ -1,6 +1,7 @@ import copy import os import json +import logging import pipmaster as pm # Pipmaster for dynamic library install @@ -24,21 +25,121 @@ else: from collections.abc import AsyncIterator from typing import Union +# Import botocore exceptions for proper exception handling +try: + from botocore.exceptions import ( + ClientError, + ConnectionError as BotocoreConnectionError, + ReadTimeoutError, + ) +except ImportError: + # If botocore is not installed, define placeholders + ClientError = Exception + BotocoreConnectionError = Exception + ReadTimeoutError = Exception + class BedrockError(Exception): """Generic error for issues related to Amazon Bedrock""" +class BedrockRateLimitError(BedrockError): + """Error for rate limiting and throttling issues""" + + +class BedrockConnectionError(BedrockError): + """Error for network and connection issues""" + + +class BedrockTimeoutError(BedrockError): + """Error for timeout issues""" + + def _set_env_if_present(key: str, value): """Set environment variable only if a non-empty value is provided.""" if value is not None and value != "": os.environ[key] = value +def _handle_bedrock_exception(e: Exception, operation: str = "Bedrock API") -> None: + """Convert AWS Bedrock exceptions to appropriate custom exceptions. + + Args: + e: The exception to handle + operation: Description of the operation for error messages + + Raises: + BedrockRateLimitError: For rate limiting and throttling issues (retryable) + BedrockConnectionError: For network and server issues (retryable) + BedrockTimeoutError: For timeout issues (retryable) + BedrockError: For validation and other non-retryable errors + """ + error_message = str(e) + + # Handle botocore ClientError with specific error codes + if isinstance(e, ClientError): + error_code = e.response.get("Error", {}).get("Code", "") + error_msg = e.response.get("Error", {}).get("Message", error_message) + + # Rate limiting and throttling errors (retryable) + if error_code in [ + "ThrottlingException", + "ProvisionedThroughputExceededException", + ]: + logging.error(f"{operation} rate limit error: {error_msg}") + raise BedrockRateLimitError(f"Rate limit error: {error_msg}") + + # Server errors (retryable) + elif error_code in ["ServiceUnavailableException", "InternalServerException"]: + logging.error(f"{operation} connection error: {error_msg}") + raise BedrockConnectionError(f"Service error: {error_msg}") + + # Check for 5xx HTTP status codes (retryable) + elif e.response.get("ResponseMetadata", {}).get("HTTPStatusCode", 0) >= 500: + logging.error(f"{operation} server error: {error_msg}") + raise BedrockConnectionError(f"Server error: {error_msg}") + + # Validation and other client errors (non-retryable) + else: + logging.error(f"{operation} client error: {error_msg}") + raise BedrockError(f"Client error: {error_msg}") + + # Connection errors (retryable) + elif isinstance(e, BotocoreConnectionError): + logging.error(f"{operation} connection error: {error_message}") + raise BedrockConnectionError(f"Connection error: {error_message}") + + # Timeout errors (retryable) + elif isinstance(e, (ReadTimeoutError, TimeoutError)): + logging.error(f"{operation} timeout error: {error_message}") + raise BedrockTimeoutError(f"Timeout error: {error_message}") + + # Custom Bedrock errors 
(already properly typed) + elif isinstance( + e, + ( + BedrockRateLimitError, + BedrockConnectionError, + BedrockTimeoutError, + BedrockError, + ), + ): + raise + + # Unknown errors (non-retryable) + else: + logging.error(f"{operation} unexpected error: {error_message}") + raise BedrockError(f"Unexpected error: {error_message}") + + @retry( stop=stop_after_attempt(5), - wait=wait_exponential(multiplier=1, max=60), - retry=retry_if_exception_type((BedrockError)), + wait=wait_exponential(multiplier=1, min=4, max=60), + retry=( + retry_if_exception_type(BedrockRateLimitError) + | retry_if_exception_type(BedrockConnectionError) + | retry_if_exception_type(BedrockTimeoutError) + ), ) async def bedrock_complete_if_cache( model, @@ -159,9 +260,6 @@ async def bedrock_complete_if_cache( break except Exception as e: - # Log the specific error for debugging - logging.error(f"Bedrock streaming error: {e}") - # Try to clean up resources if possible if ( iteration_started @@ -176,7 +274,8 @@ async def bedrock_complete_if_cache( f"Failed to close Bedrock event stream: {close_error}" ) - raise BedrockError(f"Streaming error: {e}") + # Convert to appropriate exception type + _handle_bedrock_exception(e, "Bedrock streaming") finally: # Clean up the event stream @@ -232,10 +331,8 @@ async def bedrock_complete_if_cache( return content except Exception as e: - if isinstance(e, BedrockError): - raise - else: - raise BedrockError(f"Bedrock API error: {e}") + # Convert to appropriate exception type + _handle_bedrock_exception(e, "Bedrock converse") # Generic Bedrock completion function @@ -255,11 +352,15 @@ async def bedrock_complete( @wrap_embedding_func_with_attrs(embedding_dim=1024, max_token_size=8192) -# @retry( -# stop=stop_after_attempt(3), -# wait=wait_exponential(multiplier=1, min=4, max=10), -# retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)), # TODO: fix exceptions -# ) +@retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=60), + retry=( + retry_if_exception_type(BedrockRateLimitError) + | retry_if_exception_type(BedrockConnectionError) + | retry_if_exception_type(BedrockTimeoutError) + ), +) async def bedrock_embed( texts: list[str], model: str = "amazon.titan-embed-text-v2:0", @@ -282,48 +383,101 @@ async def bedrock_embed( async with session.client( "bedrock-runtime", region_name=region ) as bedrock_async_client: - if (model_provider := model.split(".")[0]) == "amazon": - embed_texts = [] - for text in texts: - if "v2" in model: + try: + if (model_provider := model.split(".")[0]) == "amazon": + embed_texts = [] + for text in texts: + try: + if "v2" in model: + body = json.dumps( + { + "inputText": text, + # 'dimensions': embedding_dim, + "embeddingTypes": ["float"], + } + ) + elif "v1" in model: + body = json.dumps({"inputText": text}) + else: + raise BedrockError(f"Model {model} is not supported!") + + response = await bedrock_async_client.invoke_model( + modelId=model, + body=body, + accept="application/json", + contentType="application/json", + ) + + response_body = await response.get("body").json() + + # Validate response structure + if not response_body or "embedding" not in response_body: + raise BedrockError( + f"Invalid embedding response structure for text: {text[:50]}..." + ) + + embedding = response_body["embedding"] + if not embedding: + raise BedrockError( + f"Received empty embedding for text: {text[:50]}..." 
+ ) + + embed_texts.append(embedding) + + except Exception as e: + # Convert to appropriate exception type + _handle_bedrock_exception( + e, "Bedrock embedding (amazon, text chunk)" + ) + + elif model_provider == "cohere": + try: body = json.dumps( { - "inputText": text, - # 'dimensions': embedding_dim, - "embeddingTypes": ["float"], + "texts": texts, + "input_type": "search_document", + "truncate": "NONE", } ) - elif "v1" in model: - body = json.dumps({"inputText": text}) - else: - raise ValueError(f"Model {model} is not supported!") - response = await bedrock_async_client.invoke_model( - modelId=model, - body=body, - accept="application/json", - contentType="application/json", + response = await bedrock_async_client.invoke_model( + model=model, + body=body, + accept="application/json", + contentType="application/json", + ) + + response_body = json.loads(response.get("body").read()) + + # Validate response structure + if not response_body or "embeddings" not in response_body: + raise BedrockError( + "Invalid embedding response structure from Cohere" + ) + + embeddings = response_body["embeddings"] + if not embeddings or len(embeddings) != len(texts): + raise BedrockError( + f"Invalid embeddings count: expected {len(texts)}, got {len(embeddings) if embeddings else 0}" + ) + + embed_texts = embeddings + + except Exception as e: + # Convert to appropriate exception type + _handle_bedrock_exception(e, "Bedrock embedding (cohere)") + + else: + raise BedrockError( + f"Model provider '{model_provider}' is not supported!" ) - response_body = await response.get("body").json() + # Final validation + if not embed_texts: + raise BedrockError("No embeddings generated") - embed_texts.append(response_body["embedding"]) - elif model_provider == "cohere": - body = json.dumps( - {"texts": texts, "input_type": "search_document", "truncate": "NONE"} - ) + return np.array(embed_texts) - response = await bedrock_async_client.invoke_model( - model=model, - body=body, - accept="application/json", - contentType="application/json", - ) - - response_body = json.loads(response.get("body").read()) - - embed_texts = response_body["embeddings"] - else: - raise ValueError(f"Model provider '{model_provider}' is not supported!") - - return np.array(embed_texts) + except Exception as e: + # Convert to appropriate exception type + _handle_bedrock_exception(e, "Bedrock embedding") From ab4d7ac2b0bb4821f788cf321fcb6b2d68dba580 Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 14 Nov 2025 19:28:36 +0800 Subject: [PATCH 3/8] Add configurable embedding token limit with validation - Add EMBEDDING_TOKEN_LIMIT env var - Set max_token_size on embedding func - Add token limit property to LightRAG - Validate summary length vs limit - Log warning when limit exceeded --- lightrag/api/config.py | 5 +++++ lightrag/api/lightrag_server.py | 5 +++++ lightrag/lightrag.py | 7 +++++++ lightrag/operate.py | 14 ++++++++++++++ 4 files changed, 31 insertions(+) diff --git a/lightrag/api/config.py b/lightrag/api/config.py index 95ab9f70..4f59d3c1 100644 --- a/lightrag/api/config.py +++ b/lightrag/api/config.py @@ -445,6 +445,11 @@ def parse_args() -> argparse.Namespace: "EMBEDDING_BATCH_NUM", DEFAULT_EMBEDDING_BATCH_NUM, int ) + # Embedding token limit configuration + args.embedding_token_limit = get_env_value( + "EMBEDDING_TOKEN_LIMIT", None, int, special_none=True + ) + ollama_server_infos.LIGHTRAG_NAME = args.simulated_model_name ollama_server_infos.LIGHTRAG_TAG = args.simulated_model_tag diff --git a/lightrag/api/lightrag_server.py 
b/lightrag/api/lightrag_server.py index e5a86b3e..adbc5f28 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -784,6 +784,11 @@ def create_app(args): send_dimensions=send_dimensions, ) + # Set max_token_size if EMBEDDING_TOKEN_LIMIT is provided + if args.embedding_token_limit is not None: + embedding_func.max_token_size = args.embedding_token_limit + logger.info(f"Set embedding max_token_size to {args.embedding_token_limit}") + # Configure rerank function based on args.rerank_bindingparameter rerank_model_func = None if args.rerank_binding != "null": diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index e540e3b7..72a4dc6d 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -276,6 +276,13 @@ class LightRAG: embedding_func: EmbeddingFunc | None = field(default=None) """Function for computing text embeddings. Must be set before use.""" + @property + def embedding_token_limit(self) -> int | None: + """Get the token limit for embedding model from embedding_func.""" + if self.embedding_func and hasattr(self.embedding_func, "max_token_size"): + return self.embedding_func.max_token_size + return None + embedding_batch_num: int = field(default=int(os.getenv("EMBEDDING_BATCH_NUM", 10))) """Batch size for embedding computations.""" diff --git a/lightrag/operate.py b/lightrag/operate.py index ae2be49e..858553b1 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -345,6 +345,20 @@ async def _summarize_descriptions( llm_response_cache=llm_response_cache, cache_type="summary", ) + + # Check summary token length against embedding limit + embedding_token_limit = global_config.get("embedding_token_limit") + if embedding_token_limit is not None and summary: + tokenizer = global_config["tokenizer"] + summary_token_count = len(tokenizer.encode(summary)) + threshold = int(embedding_token_limit * 0.9) + + if summary_token_count > threshold: + logger.warning( + f"Summary tokens ({summary_token_count}) exceeds 90% of embedding limit " + f"({embedding_token_limit}) for {description_type}: {description_name}" + ) + return summary From 39b49e92ffa1fe62d8a00932010d5fb940dec030 Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 14 Nov 2025 20:58:41 +0800 Subject: [PATCH 4/8] Convert embedding_token_limit from property to field with __post_init__ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Remove property decorator • Add field with init=False • Set value in __post_init__ method • embedding_token_limit is now in config dictionary --- lightrag/lightrag.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 72a4dc6d..1f0aa9d9 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -276,12 +276,8 @@ class LightRAG: embedding_func: EmbeddingFunc | None = field(default=None) """Function for computing text embeddings. Must be set before use.""" - @property - def embedding_token_limit(self) -> int | None: - """Get the token limit for embedding model from embedding_func.""" - if self.embedding_func and hasattr(self.embedding_func, "max_token_size"): - return self.embedding_func.max_token_size - return None + embedding_token_limit: int | None = field(default=None, init=False) + """Token limit for embedding model. 
Set automatically from embedding_func.max_token_size in __post_init__.""" embedding_batch_num: int = field(default=int(os.getenv("EMBEDDING_BATCH_NUM", 10))) """Batch size for embedding computations.""" @@ -532,6 +528,12 @@ class LightRAG: queue_name="Embedding func", )(self.embedding_func) + # Initialize embedding_token_limit from embedding_func + if self.embedding_func and hasattr(self.embedding_func, "max_token_size"): + self.embedding_token_limit = self.embedding_func.max_token_size + else: + self.embedding_token_limit = None + # Initialize all storages self.key_string_value_json_storage_cls: type[BaseKVStorage] = ( self._get_storage_class(self.kv_storage) From 963a0a5db154a6cbcf62c060ae6a4549d5079fe2 Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 14 Nov 2025 22:29:08 +0800 Subject: [PATCH 5/8] Refactor embedding function creation with proper attribute inheritance - Extract max_token_size from providers - Avoid double-wrapping EmbeddingFunc - Improve configuration priority logic - Add comprehensive debug logging - Return complete EmbeddingFunc instance --- lightrag/api/lightrag_server.py | 173 +++++++++++++++++++++++++------- 1 file changed, 139 insertions(+), 34 deletions(-) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index adbc5f28..f4b0bd24 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -618,33 +618,102 @@ def create_app(args): def create_optimized_embedding_function( config_cache: LLMConfigCache, binding, model, host, api_key, args - ): + ) -> EmbeddingFunc: """ - Create optimized embedding function with pre-processed configuration for applicable bindings. - Uses lazy imports for all bindings and avoids repeated configuration parsing. + Create optimized embedding function and return an EmbeddingFunc instance + with proper max_token_size inheritance from provider defaults. + + This function: + 1. Imports the provider embedding function + 2. Extracts max_token_size and embedding_dim from provider if it's an EmbeddingFunc + 3. Creates an optimized wrapper that calls the underlying function directly (avoiding double-wrapping) + 4. 
Returns a properly configured EmbeddingFunc instance """ + # Step 1: Import provider function and extract default attributes + provider_func = None + default_max_token_size = None + default_embedding_dim = args.embedding_dim # Use config as default + + try: + if binding == "openai": + from lightrag.llm.openai import openai_embed + + provider_func = openai_embed + elif binding == "ollama": + from lightrag.llm.ollama import ollama_embed + + provider_func = ollama_embed + elif binding == "gemini": + from lightrag.llm.gemini import gemini_embed + + provider_func = gemini_embed + elif binding == "jina": + from lightrag.llm.jina import jina_embed + + provider_func = jina_embed + elif binding == "azure_openai": + from lightrag.llm.azure_openai import azure_openai_embed + + provider_func = azure_openai_embed + elif binding == "aws_bedrock": + from lightrag.llm.bedrock import bedrock_embed + + provider_func = bedrock_embed + elif binding == "lollms": + from lightrag.llm.lollms import lollms_embed + + provider_func = lollms_embed + + # Extract attributes if provider is an EmbeddingFunc + if provider_func and isinstance(provider_func, EmbeddingFunc): + default_max_token_size = provider_func.max_token_size + default_embedding_dim = provider_func.embedding_dim + logger.debug( + f"Extracted from {binding} provider: " + f"max_token_size={default_max_token_size}, " + f"embedding_dim={default_embedding_dim}" + ) + except ImportError as e: + logger.warning(f"Could not import provider function for {binding}: {e}") + + # Step 2: Apply priority (environment variable > provider default) + final_max_token_size = args.embedding_token_limit or default_max_token_size + + # Step 3: Create optimized embedding function (calls underlying function directly) async def optimized_embedding_function(texts, embedding_dim=None): try: if binding == "lollms": from lightrag.llm.lollms import lollms_embed - return await lollms_embed( + # Get real function, skip EmbeddingFunc wrapper if present + actual_func = ( + lollms_embed.func + if isinstance(lollms_embed, EmbeddingFunc) + else lollms_embed + ) + return await actual_func( texts, embed_model=model, host=host, api_key=api_key ) elif binding == "ollama": from lightrag.llm.ollama import ollama_embed - # Use pre-processed configuration if available, otherwise fallback to dynamic parsing + # Get real function, skip EmbeddingFunc wrapper if present + actual_func = ( + ollama_embed.func + if isinstance(ollama_embed, EmbeddingFunc) + else ollama_embed + ) + + # Use pre-processed configuration if available if config_cache.ollama_embedding_options is not None: ollama_options = config_cache.ollama_embedding_options else: - # Fallback for cases where config cache wasn't initialized properly from lightrag.llm.binding_options import OllamaEmbeddingOptions ollama_options = OllamaEmbeddingOptions.options_dict(args) - return await ollama_embed( + return await actual_func( texts, embed_model=model, host=host, @@ -654,15 +723,30 @@ def create_app(args): elif binding == "azure_openai": from lightrag.llm.azure_openai import azure_openai_embed - return await azure_openai_embed(texts, model=model, api_key=api_key) + actual_func = ( + azure_openai_embed.func + if isinstance(azure_openai_embed, EmbeddingFunc) + else azure_openai_embed + ) + return await actual_func(texts, model=model, api_key=api_key) elif binding == "aws_bedrock": from lightrag.llm.bedrock import bedrock_embed - return await bedrock_embed(texts, model=model) + actual_func = ( + bedrock_embed.func + if isinstance(bedrock_embed, 
EmbeddingFunc) + else bedrock_embed + ) + return await actual_func(texts, model=model) elif binding == "jina": from lightrag.llm.jina import jina_embed - return await jina_embed( + actual_func = ( + jina_embed.func + if isinstance(jina_embed, EmbeddingFunc) + else jina_embed + ) + return await actual_func( texts, embedding_dim=embedding_dim, base_url=host, @@ -671,16 +755,21 @@ def create_app(args): elif binding == "gemini": from lightrag.llm.gemini import gemini_embed - # Use pre-processed configuration if available, otherwise fallback to dynamic parsing + actual_func = ( + gemini_embed.func + if isinstance(gemini_embed, EmbeddingFunc) + else gemini_embed + ) + + # Use pre-processed configuration if available if config_cache.gemini_embedding_options is not None: gemini_options = config_cache.gemini_embedding_options else: - # Fallback for cases where config cache wasn't initialized properly from lightrag.llm.binding_options import GeminiEmbeddingOptions gemini_options = GeminiEmbeddingOptions.options_dict(args) - return await gemini_embed( + return await actual_func( texts, model=model, base_url=host, @@ -691,7 +780,12 @@ def create_app(args): else: # openai and compatible from lightrag.llm.openai import openai_embed - return await openai_embed( + actual_func = ( + openai_embed.func + if isinstance(openai_embed, EmbeddingFunc) + else openai_embed + ) + return await actual_func( texts, model=model, base_url=host, @@ -701,7 +795,15 @@ def create_app(args): except ImportError as e: raise Exception(f"Failed to import {binding} embedding: {e}") - return optimized_embedding_function + # Step 4: Wrap in EmbeddingFunc and return + embedding_func_instance = EmbeddingFunc( + embedding_dim=default_embedding_dim, + func=optimized_embedding_function, + max_token_size=final_max_token_size, + send_dimensions=False, # Will be set later based on binding requirements + ) + + return embedding_func_instance llm_timeout = get_env_value("LLM_TIMEOUT", DEFAULT_LLM_TIMEOUT, int) embedding_timeout = get_env_value( @@ -735,25 +837,24 @@ def create_app(args): **kwargs, ) - # Create embedding function with optimized configuration + # Create embedding function with optimized configuration and max_token_size inheritance import inspect - # Create the optimized embedding function - optimized_embedding_func = create_optimized_embedding_function( + # Create the EmbeddingFunc instance (now returns complete EmbeddingFunc with max_token_size) + embedding_func = create_optimized_embedding_function( config_cache=config_cache, binding=args.embedding_binding, model=args.embedding_model, host=args.embedding_binding_host, api_key=args.embedding_binding_api_key, - args=args, # Pass args object for fallback option generation + args=args, ) # Get embedding_send_dim from centralized configuration embedding_send_dim = args.embedding_send_dim - # Check if the function signature has embedding_dim parameter - # Note: Since optimized_embedding_func is an async function, inspect its signature - sig = inspect.signature(optimized_embedding_func) + # Check if the underlying function signature has embedding_dim parameter + sig = inspect.signature(embedding_func.func) has_embedding_dim_param = "embedding_dim" in sig.parameters # Determine send_dimensions value based on binding type @@ -771,23 +872,27 @@ def create_app(args): else: dimension_control = "by not hasparam" + # Set send_dimensions on the EmbeddingFunc instance + embedding_func.send_dimensions = send_dimensions + logger.info( f"Send embedding dimension: {send_dimensions} 
{dimension_control} " - f"(dimensions={args.embedding_dim}, has_param={has_embedding_dim_param}, " + f"(dimensions={embedding_func.embedding_dim}, has_param={has_embedding_dim_param}, " f"binding={args.embedding_binding})" ) - # Create EmbeddingFunc with send_dimensions attribute - embedding_func = EmbeddingFunc( - embedding_dim=args.embedding_dim, - func=optimized_embedding_func, - send_dimensions=send_dimensions, - ) - - # Set max_token_size if EMBEDDING_TOKEN_LIMIT is provided - if args.embedding_token_limit is not None: - embedding_func.max_token_size = args.embedding_token_limit - logger.info(f"Set embedding max_token_size to {args.embedding_token_limit}") + # Log max_token_size source + if embedding_func.max_token_size: + source = ( + "env variable" + if args.embedding_token_limit + else f"{args.embedding_binding} provider default" + ) + logger.info( + f"Embedding max_token_size: {embedding_func.max_token_size} (from {source})" + ) + else: + logger.info("Embedding max_token_size: not set (90% token warning disabled)") # Configure rerank function based on args.rerank_bindingparameter rerank_model_func = None From de4412dd4080380d9da870d271faffffebba8760 Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 14 Nov 2025 22:56:03 +0800 Subject: [PATCH 6/8] Fix embedding token limit initialization order * Capture max_token_size before decorator * Apply wrapper after capturing attribute * Prevent decorator from stripping dataclass * Ensure token limit is properly set --- lightrag/lightrag.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 1f0aa9d9..c53c98ac 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -522,18 +522,22 @@ class LightRAG: logger.debug(f"LightRAG init with param:\n {_print_config}\n") # Init Embedding + # Step 1: Capture max_token_size before applying decorator (decorator strips dataclass attributes) + embedding_max_token_size = None + if self.embedding_func and hasattr(self.embedding_func, "max_token_size"): + embedding_max_token_size = self.embedding_func.max_token_size + logger.debug( + f"Captured embedding max_token_size: {embedding_max_token_size}" + ) + self.embedding_token_limit = embedding_max_token_size + + # Step 2: Apply priority wrapper decorator self.embedding_func = priority_limit_async_func_call( self.embedding_func_max_async, llm_timeout=self.default_embedding_timeout, queue_name="Embedding func", )(self.embedding_func) - # Initialize embedding_token_limit from embedding_func - if self.embedding_func and hasattr(self.embedding_func, "max_token_size"): - self.embedding_token_limit = self.embedding_func.max_token_size - else: - self.embedding_token_limit = None - # Initialize all storages self.key_string_value_json_storage_cls: type[BaseKVStorage] = ( self._get_storage_class(self.kv_storage) From 5dec4deac7ca23d818d1a36ab1497f5eef08f3d2 Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 14 Nov 2025 23:22:44 +0800 Subject: [PATCH 7/8] Improve embedding config priority and add debug logging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Fix embedding_dim priority logic • Add final config logging --- lightrag/api/lightrag_server.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index f4b0bd24..41a07f7f 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -632,8 +632,8 @@ def 
create_app(args): # Step 1: Import provider function and extract default attributes provider_func = None - default_max_token_size = None - default_embedding_dim = args.embedding_dim # Use config as default + provider_max_token_size = None + provider_embedding_dim = None try: if binding == "openai": @@ -667,18 +667,24 @@ def create_app(args): # Extract attributes if provider is an EmbeddingFunc if provider_func and isinstance(provider_func, EmbeddingFunc): - default_max_token_size = provider_func.max_token_size - default_embedding_dim = provider_func.embedding_dim + provider_max_token_size = provider_func.max_token_size + provider_embedding_dim = provider_func.embedding_dim logger.debug( f"Extracted from {binding} provider: " - f"max_token_size={default_max_token_size}, " - f"embedding_dim={default_embedding_dim}" + f"max_token_size={provider_max_token_size}, " + f"embedding_dim={provider_embedding_dim}" ) except ImportError as e: logger.warning(f"Could not import provider function for {binding}: {e}") - # Step 2: Apply priority (environment variable > provider default) - final_max_token_size = args.embedding_token_limit or default_max_token_size + # Step 2: Apply priority (user config > provider default) + # For max_token_size: explicit env var > provider default > None + final_max_token_size = args.embedding_token_limit or provider_max_token_size + # For embedding_dim: user config (always has value) takes priority + # Only use provider default if user config is explicitly None (which shouldn't happen) + final_embedding_dim = ( + args.embedding_dim if args.embedding_dim else provider_embedding_dim + ) # Step 3: Create optimized embedding function (calls underlying function directly) async def optimized_embedding_function(texts, embedding_dim=None): @@ -797,12 +803,18 @@ def create_app(args): # Step 4: Wrap in EmbeddingFunc and return embedding_func_instance = EmbeddingFunc( - embedding_dim=default_embedding_dim, + embedding_dim=final_embedding_dim, func=optimized_embedding_function, max_token_size=final_max_token_size, send_dimensions=False, # Will be set later based on binding requirements ) + # Log final embedding configuration + logger.info( + f"Embedding config: binding={binding} model={model} " + f"embedding_dim={final_embedding_dim} max_token_size={final_max_token_size}" + ) + return embedding_func_instance llm_timeout = get_env_value("LLM_TIMEOUT", DEFAULT_LLM_TIMEOUT, int) From 8722103550648f7d67fa2f6cab9712ec56e49b96 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 15 Nov 2025 01:25:56 +0800 Subject: [PATCH 8/8] Update env.example MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Comment out Ollama config • Set OpenAI as active default • Add EMBEDDING_TOKEN_LIMIT option • Add Gemini embedding configuration --- env.example | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/env.example b/env.example index a95ff9bf..60aaf0ed 100644 --- a/env.example +++ b/env.example @@ -255,21 +255,23 @@ OLLAMA_LLM_NUM_CTX=32768 ### For OpenAI: Set to 'true' to enable dynamic dimension adjustment ### For OpenAI: Set to 'false' (default) to disable sending dimension parameter ### Note: Automatically ignored for backends that don't support dimension parameter (e.g., Ollama) -# EMBEDDING_SEND_DIM=false -EMBEDDING_BINDING=ollama -EMBEDDING_MODEL=bge-m3:latest -EMBEDDING_DIM=1024 -EMBEDDING_BINDING_API_KEY=your_api_key -# If LightRAG deployed in Docker uses host.docker.internal instead of localhost 
-EMBEDDING_BINDING_HOST=http://localhost:11434 - -### OpenAI compatible (VoyageAI embedding openai compatible) -# EMBEDDING_BINDING=openai -# EMBEDDING_MODEL=text-embedding-3-large -# EMBEDDING_DIM=3072 -# EMBEDDING_BINDING_HOST=https://api.openai.com/v1 +# Ollama embedding +# EMBEDDING_BINDING=ollama +# EMBEDDING_MODEL=bge-m3:latest +# EMBEDDING_DIM=1024 # EMBEDDING_BINDING_API_KEY=your_api_key +### If LightRAG deployed in Docker uses host.docker.internal instead of localhost +# EMBEDDING_BINDING_HOST=http://localhost:11434 + +### OpenAI compatible embedding +EMBEDDING_BINDING=openai +EMBEDDING_MODEL=text-embedding-3-large +EMBEDDING_DIM=3072 +EMBEDDING_SEND_DIM=false +EMBEDDING_TOKEN_LIMIT=8192 +EMBEDDING_BINDING_HOST=https://api.openai.com/v1 +EMBEDDING_BINDING_API_KEY=your_api_key ### Optional for Azure # AZURE_EMBEDDING_DEPLOYMENT=text-embedding-3-large @@ -277,6 +279,16 @@ EMBEDDING_BINDING_HOST=http://localhost:11434 # AZURE_EMBEDDING_ENDPOINT=your_endpoint # AZURE_EMBEDDING_API_KEY=your_api_key +### Gemini embedding +# EMBEDDING_BINDING=gemini +# EMBEDDING_MODEL=gemini-embedding-001 +# EMBEDDING_DIM=1536 +# EMBEDDING_TOKEN_LIMIT=2048 +# EMBEDDING_BINDING_HOST=https://generativelanguage.googleapis.com +# EMBEDDING_BINDING_API_KEY=your_api_key +### Gemini embedding requires sending dimension to server +# EMBEDDING_SEND_DIM=true + ### Jina AI Embedding # EMBEDDING_BINDING=jina # EMBEDDING_BINDING_HOST=https://api.jina.ai/v1/embeddings
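
The patches above lean on a few recurring patterns; the sketches below are illustrative stand-ins, not the exact code added by this series. First, the decorator calls introduced in PATCH 1/8 follow the usual attach-metadata-by-wrapping pattern: judging from the EmbeddingFunc fields visible in the utils.py hunk, wrap_embedding_func_with_attrs presumably turns an async embed coroutine into an EmbeddingFunc carrying embedding_dim and max_token_size. A minimal sketch of that pattern (my_embed is a made-up placeholder binding):

    from dataclasses import dataclass
    from typing import Awaitable, Callable

    import numpy as np


    @dataclass
    class EmbeddingFunc:
        embedding_dim: int
        func: Callable[..., Awaitable[np.ndarray]]
        max_token_size: int | None = None  # Token limit for the embedding model
        send_dimensions: bool = False  # Whether to pass embedding_dim through to the provider

        async def __call__(self, *args, **kwargs) -> np.ndarray:
            return await self.func(*args, **kwargs)


    def wrap_embedding_func_with_attrs(**attrs):
        """Return a decorator that wraps an async embed function into an EmbeddingFunc."""
        def decorator(func) -> EmbeddingFunc:
            return EmbeddingFunc(func=func, **attrs)
        return decorator


    @wrap_embedding_func_with_attrs(embedding_dim=1024, max_token_size=8192)
    async def my_embed(texts: list[str]) -> np.ndarray:
        # Placeholder: a real binding would call the provider API here.
        return np.zeros((len(texts), 1024))

With this shape, callers can read my_embed.max_token_size without invoking the function, which is what the later patches rely on.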
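
PATCH 2/8 replaces the single BedrockError retry predicate with three retryable exception classes combined with |. A self-contained sketch of that tenacity pattern (the exception names and the call_provider stub are placeholders, not LightRAG symbols):

    from tenacity import (
        retry,
        retry_if_exception_type,
        stop_after_attempt,
        wait_exponential,
    )


    class RateLimitError(Exception):
        """Retryable: the provider throttled the request."""


    class TransientConnectionError(Exception):
        """Retryable: network failure or 5xx from the provider."""


    class PermanentError(Exception):
        """Non-retryable: validation or other client-side error."""


    @retry(
        stop=stop_after_attempt(5),
        wait=wait_exponential(multiplier=1, min=4, max=60),
        # The | operator builds a combined predicate: retry on either retryable
        # type, while PermanentError (and anything else) propagates immediately.
        retry=(
            retry_if_exception_type(RateLimitError)
            | retry_if_exception_type(TransientConnectionError)
        ),
    )
    async def call_provider(op):
        return await op()

Classifying raw botocore errors into such types before re-raising, as _handle_bedrock_exception does in the patch, is what lets the decorator distinguish retryable throttling and 5xx failures from permanent validation errors.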
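
PATCH 3/8 adds a warning when an entity or relation summary approaches 90% of the embedding token limit. A standalone sketch of the same check; tiktoken stands in here for whatever tokenizer LightRAG actually passes through global_config:

    import logging

    import tiktoken  # stand-in tokenizer; LightRAG supplies its own via global_config

    logger = logging.getLogger(__name__)


    def warn_if_summary_near_limit(summary: str, embedding_token_limit: int | None) -> None:
        """Warn when a summary uses more than 90% of the embedding token budget."""
        if not summary or embedding_token_limit is None:
            return
        tokenizer = tiktoken.get_encoding("cl100k_base")
        summary_tokens = len(tokenizer.encode(summary))
        threshold = int(embedding_token_limit * 0.9)
        if summary_tokens > threshold:
            logger.warning(
                "Summary tokens (%d) exceed 90%% of embedding limit (%d)",
                summary_tokens,
                embedding_token_limit,
            )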
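
PATCHES 5/8 and 7/8 reduce the server-side wiring to two rules: explicit user configuration (EMBEDDING_TOKEN_LIMIT, EMBEDDING_DIM) wins over the provider default, and a provider function that is already an EmbeddingFunc is unwrapped so it is not wrapped twice. A condensed sketch of that logic, reusing the EmbeddingFunc shape and my_embed placeholder from the first sketch (function names here are illustrative):

    def resolve_embedding_attrs(provider, env_token_limit, config_dim):
        """Priority: explicit user config first, then the provider's own defaults."""
        provider_max_tokens = (
            provider.max_token_size if isinstance(provider, EmbeddingFunc) else None
        )
        provider_dim = (
            provider.embedding_dim if isinstance(provider, EmbeddingFunc) else None
        )
        final_max_tokens = env_token_limit or provider_max_tokens
        final_dim = config_dim if config_dim else provider_dim
        return final_dim, final_max_tokens


    def unwrap(provider):
        """Return the raw coroutine so it is not wrapped in EmbeddingFunc twice."""
        return provider.func if isinstance(provider, EmbeddingFunc) else provider


    dim, max_tokens = resolve_embedding_attrs(my_embed, env_token_limit=None, config_dim=3072)
    embed_callable = unwrap(my_embed)  # call this directly inside the new wrapper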
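
Finally, PATCH 6/8 exists because priority_limit_async_func_call returns a plain async wrapper, so any EmbeddingFunc attribute must be captured before that decorator is applied. A toy illustration of the ordering problem, continuing the sketch above (limit_concurrency is a stand-in for the real decorator):

    def limit_concurrency(func):
        """Stand-in for priority_limit_async_func_call(...): returns a bare wrapper."""
        async def wrapper(*args, **kwargs):
            return await func(*args, **kwargs)
        return wrapper


    # Capture the attribute first...
    token_limit = getattr(my_embed, "max_token_size", None)  # 8192

    # ...because the wrapper no longer exposes the dataclass fields.
    wrapped_embed = limit_concurrency(my_embed)
    assert not hasattr(wrapped_embed, "max_token_size")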