From 64015548dfb7d6b55d436cf8663d6f9df419266f Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Tue, 19 Aug 2025 17:49:23 +0800
Subject: [PATCH 1/4] Refactor MD5 hash functions and consolidate Unicode error
 handling

---
 lightrag/utils.py | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/lightrag/utils.py b/lightrag/utils.py
index bbafd9f1..87ccbea1 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -272,19 +272,26 @@ def compute_args_hash(*args: Any) -> str:
     Returns:
         str: Hash string
     """
-    import hashlib
-
     # Convert all arguments to strings and join them
     args_str = "".join([str(arg) for arg in args])
 
     # Use 'replace' error handling to safely encode problematic Unicode characters
     # This replaces invalid characters with Unicode replacement character (U+FFFD)
     try:
-        return hashlib.md5(args_str.encode("utf-8")).hexdigest()
+        return md5(args_str.encode("utf-8")).hexdigest()
     except UnicodeEncodeError:
         # Handle surrogate characters and other encoding issues
         safe_bytes = args_str.encode("utf-8", errors="replace")
-        return hashlib.md5(safe_bytes).hexdigest()
+        return md5(safe_bytes).hexdigest()
+
+
+def compute_mdhash_id(content: str, prefix: str = "") -> str:
+    """
+    Compute a unique ID for a given content string.
+
+    The ID is a combination of the given prefix and the MD5 hash of the content string.
+    """
+    return prefix + compute_args_hash(content)
 
 
 def generate_cache_key(mode: str, cache_type: str, hash_value: str) -> str:
@@ -316,15 +323,6 @@ def parse_cache_key(cache_key: str) -> tuple[str, str, str] | None:
     return None
 
 
-def compute_mdhash_id(content: str, prefix: str = "") -> str:
-    """
-    Compute a unique ID for a given content string.
-
-    The ID is a combination of the given prefix and the MD5 hash of the content string.
-    """
-    return prefix + md5(content.encode()).hexdigest()
-
-
 # Custom exception class
 class QueueFullError(Exception):
     """Raised when the queue is full and the wait times out"""

From f9cf544805a10d597d0739b5eedd439a0ea8d0a6 Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Tue, 19 Aug 2025 18:50:52 +0800
Subject: [PATCH 2/4] Add text sanitization to prevent UTF-8 encoding errors in
 LLM calls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Remove surrogate characters
• Clean control characters
• Sanitize input and history messages
• Add comprehensive error handling
• Log sanitization activities
---
 lightrag/utils.py | 173 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 159 insertions(+), 14 deletions(-)

diff --git a/lightrag/utils.py b/lightrag/utils.py
index 87ccbea1..e830d944 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -1400,11 +1400,13 @@ async def use_llm_func_with_cache(
     chunk_id: str | None = None,
     cache_keys_collector: list = None,
 ) -> str:
-    """Call LLM function with cache support
+    """Call LLM function with cache support and text sanitization
 
     If cache is available and enabled (determined by handle_cache based on mode),
     retrieve result from cache; otherwise call LLM function and save result to cache.
 
+    This function applies text sanitization to prevent UTF-8 encoding errors for all LLM providers.
+
     Args:
         input_text: Input text to send to LLM
         use_llm_func: LLM function with higher priority
@@ -1419,12 +1421,27 @@ async def use_llm_func_with_cache(
     Returns:
         LLM response text
     """
+    # Sanitize input text to prevent UTF-8 encoding errors for all LLM providers
+    safe_input_text = safe_encode_for_llm(input_text, f"llm_input_{cache_type}")
+
+    # Sanitize history messages if provided
+    safe_history_messages = None
+    if history_messages:
+        safe_history_messages = []
+        for i, msg in enumerate(history_messages):
+            safe_msg = msg.copy()
+            if "content" in safe_msg:
+                safe_msg["content"] = safe_encode_for_llm(
+                    safe_msg["content"], f"history_message_{i}"
+                )
+            safe_history_messages.append(safe_msg)
+
     if llm_response_cache:
-        if history_messages:
-            history = json.dumps(history_messages, ensure_ascii=False)
-            _prompt = history + "\n" + input_text
+        if safe_history_messages:
+            history = json.dumps(safe_history_messages, ensure_ascii=False)
+            _prompt = history + "\n" + safe_input_text
         else:
-            _prompt = input_text
+            _prompt = safe_input_text
 
         arg_hash = compute_args_hash(_prompt)
         # Generate cache key for this LLM call
@@ -1448,14 +1465,14 @@ async def use_llm_func_with_cache(
             return cached_return
         statistic_data["llm_call"] += 1
 
-        # Call LLM
+        # Call LLM with sanitized input
         kwargs = {}
-        if history_messages:
-            kwargs["history_messages"] = history_messages
+        if safe_history_messages:
+            kwargs["history_messages"] = safe_history_messages
         if max_tokens is not None:
             kwargs["max_tokens"] = max_tokens
 
-        res: str = await use_llm_func(input_text, **kwargs)
+        res: str = await use_llm_func(safe_input_text, **kwargs)
         res = remove_think_tags(res)
 
         if llm_response_cache.global_config.get("enable_llm_cache_for_entity_extract"):
@@ -1476,15 +1493,15 @@ async def use_llm_func_with_cache(
 
         return res
 
-    # When cache is disabled, directly call LLM
+    # When cache is disabled, directly call LLM with sanitized input
     kwargs = {}
-    if history_messages:
-        kwargs["history_messages"] = history_messages
+    if safe_history_messages:
+        kwargs["history_messages"] = safe_history_messages
     if max_tokens is not None:
         kwargs["max_tokens"] = max_tokens
 
-    logger.info(f"Call LLM function with query text length: {len(input_text)}")
-    res = await use_llm_func(input_text, **kwargs)
+    logger.info(f"Call LLM function with query text length: {len(safe_input_text)}")
+    res = await use_llm_func(safe_input_text, **kwargs)
     return remove_think_tags(res)
 
 
@@ -1570,6 +1587,134 @@ def clean_text(text: str) -> str:
     return text.strip().replace("\x00", "")
 
 
+def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
+    """Sanitize text to ensure safe UTF-8 encoding by removing or replacing problematic characters.
+
+    This function handles:
+    - Surrogate characters (the main cause of the encoding error)
+    - Other invalid Unicode sequences
+    - Control characters that might cause issues
+
+    Args:
+        text: Input text to sanitize
+        replacement_char: Character to use for replacing invalid sequences
+
+    Returns:
+        Sanitized text that can be safely encoded as UTF-8
+    """
+    if not isinstance(text, str):
+        return str(text)
+
+    if not text:
+        return text
+
+    try:
+        # First, try to encode/decode to catch any encoding issues early
+        text.encode("utf-8")
+
+        # Remove or replace surrogate characters (U+D800 to U+DFFF)
+        # These are the main cause of the encoding error
+        sanitized = ""
+        for char in text:
+            code_point = ord(char)
+            # Check for surrogate characters
+            if 0xD800 <= code_point <= 0xDFFF:
+                # Replace surrogate with replacement character
+                sanitized += replacement_char
+                continue
+            # Check for other problematic characters
+            elif code_point == 0xFFFE or code_point == 0xFFFF:
+                # These are non-characters in Unicode
+                sanitized += replacement_char
+                continue
+            else:
+                sanitized += char
+
+        # Additional cleanup: remove null bytes and other control characters
+        # that might cause issues (but preserve common whitespace)
+        sanitized = re.sub(
+            r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]", replacement_char, sanitized
+        )
+
+        # Test final encoding to ensure it's safe
+        sanitized.encode("utf-8")
+
+        return sanitized
+
+    except UnicodeEncodeError as e:
+        logger.warning(
+            f"Text sanitization: UnicodeEncodeError encountered, applying aggressive cleaning: {str(e)[:100]}"
+        )
+
+        # Aggressive fallback: encode with error handling
+        try:
+            # Use 'replace' error handling to substitute problematic characters
+            safe_bytes = text.encode("utf-8", errors="replace")
+            sanitized = safe_bytes.decode("utf-8")
+
+            # Additional cleanup
+            sanitized = re.sub(
+                r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]", replacement_char, sanitized
+            )
+
+            return sanitized
+
+        except Exception as fallback_error:
+            logger.error(
+                f"Text sanitization: Aggressive fallback failed: {str(fallback_error)}"
+            )
+            # Last resort: return a safe placeholder
+            return f"[TEXT_ENCODING_ERROR: {len(text)} characters]"
+
+    except Exception as e:
+        logger.error(f"Text sanitization: Unexpected error: {str(e)}")
+        # Unexpected failure during sanitization: fall back to returning the unsanitized text
+        return text
+
+
+def safe_encode_for_llm(content: str, context: str = "unknown") -> str:
+    """Safely encode content for LLM API calls with comprehensive error handling.
+
+    This is the main function to use before sending text to LLM APIs to prevent
+    UTF-8 encoding errors.
+
+    Args:
+        content: Text content to encode safely
+        context: Context description for logging (e.g., "document_chunk", "prompt")
+
+    Returns:
+        Safely encoded text that won't cause UTF-8 encoding errors
+    """
+    if not content:
+        return content
+
+    original_length = len(content)
+
+    try:
+        # Apply text sanitization
+        sanitized = sanitize_text_for_encoding(content)
+
+        # Check if any changes were made
+        if len(sanitized) != original_length or sanitized != content:
+            # Count replaced characters (empty replacement chars)
+            replaced_count = original_length - len(sanitized)
+            logger.info(
+                f"Text encoding safety: Removed {replaced_count} problematic chars "
+                f"(original: {original_length} chars, sanitized: {len(sanitized)} chars)"
+            )
+
+        return sanitized
+
+    except Exception as e:
+        logger.error(
+            f"Text encoding safety: Failed to sanitize {context} content: {str(e)}"
+        )
+        # Return a safe fallback
+        return (
+            f"[CONTENT_SANITIZATION_ERROR: {original_length} characters from {context}]"
+        )
+
+
 def check_storage_env_vars(storage_name: str) -> None:
     """Check if all required environment variables for storage implementation exist
 

From 806081645f5120dfcf72dbdb014c629b23f09905 Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Tue, 19 Aug 2025 19:20:01 +0800
Subject: [PATCH 3/4] Refactor text cleaning to use sanitize_text_for_encoding
 consistently
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Replace clean_text with sanitize_text_for_encoding
• Remove deprecated clean_text function
• Add whitespace trimming to sanitizer
• Improve UTF-8 encoding safety
• Consolidate text cleaning logic
---
 lightrag/lightrag.py | 12 ++++++------
 lightrag/utils.py    | 26 +++++++++++---------------
 2 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index af700393..e2f9a3d7 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -85,7 +85,7 @@ from .utils import (
     lazy_external_import,
     priority_limit_async_func_call,
     get_content_summary,
-    clean_text,
+    sanitize_text_for_encoding,
     check_storage_env_vars,
     generate_track_id,
     logger,
@@ -908,8 +908,8 @@ class LightRAG:
         update_storage = False
         try:
             # Clean input texts
-            full_text = clean_text(full_text)
-            text_chunks = [clean_text(chunk) for chunk in text_chunks]
+            full_text = sanitize_text_for_encoding(full_text)
+            text_chunks = [sanitize_text_for_encoding(chunk) for chunk in text_chunks]
             file_path = ""
 
             # Process cleaned texts
@@ -1020,7 +1020,7 @@ class LightRAG:
             # Generate contents dict and remove duplicates in one pass
             unique_contents = {}
             for id_, doc, path in zip(ids, input, file_paths):
-                cleaned_content = clean_text(doc)
+                cleaned_content = sanitize_text_for_encoding(doc)
                 if cleaned_content not in unique_contents:
                     unique_contents[cleaned_content] = (id_, path)
 
@@ -1033,7 +1033,7 @@ class LightRAG:
             # Clean input text and remove duplicates in one pass
             unique_content_with_paths = {}
             for doc, path in zip(input, file_paths):
-                cleaned_content = clean_text(doc)
+                cleaned_content = sanitize_text_for_encoding(doc)
                 if cleaned_content not in unique_content_with_paths:
                     unique_content_with_paths[cleaned_content] = path
 
@@ -1817,7 +1817,7 @@ class LightRAG:
             all_chunks_data: dict[str, dict[str, str]] = {}
             chunk_to_source_map: dict[str, str] = {}
             for chunk_data in custom_kg.get("chunks", []):
-                chunk_content = clean_text(chunk_data["content"])
+                chunk_content = sanitize_text_for_encoding(chunk_data["content"])
                 source_id = chunk_data["source_id"]
                 file_path = chunk_data.get("file_path", "custom_kg")
                 tokens = len(self.tokenizer.encode(chunk_content))
diff --git a/lightrag/utils.py b/lightrag/utils.py
index e830d944..8dad9c22 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -1575,18 +1575,6 @@ def normalize_extracted_info(name: str, is_entity=False) -> str:
     return name
 
 
-def clean_text(text: str) -> str:
-    """Clean text by removing null bytes (0x00) and whitespace
-
-    Args:
-        text: Input text to clean
-
-    Returns:
-        Cleaned text
-    """
-    return text.strip().replace("\x00", "")
-
-
 def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
     """Sanitize text to ensure safe UTF-8 encoding by removing or replacing problematic characters.
 
@@ -1594,6 +1582,7 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
     - Surrogate characters (the main cause of the encoding error)
     - Other invalid Unicode sequences
     - Control characters that might cause issues
+    - Whitespace trimming
 
     Args:
         text: Input text to sanitize
@@ -1609,7 +1598,14 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
         return text
 
     try:
-        # First, try to encode/decode to catch any encoding issues early
+        # First, strip whitespace
+        text = text.strip()
+
+        # Early return if text is empty after basic cleaning
+        if not text:
+            return text
+
+        # Try to encode as UTF-8 to catch any encoding issues early
         text.encode("utf-8")
 
         # Remove or replace surrogate characters (U+D800 to U+DFFF)
@@ -1630,8 +1626,8 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
             else:
                 sanitized += char
 
-        # Additional cleanup: remove null bytes and other control characters
-        # that might cause issues (but preserve common whitespace)
+        # Additional cleanup: remove null bytes and other control characters that might cause issues
+        # (but preserve common whitespace like \t, \n, \r)
         sanitized = re.sub(
             r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]", replacement_char, sanitized
         )

From ced3aef7cb83cbab19c5b2c93f30635578604331 Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Tue, 19 Aug 2025 19:37:46 +0800
Subject: [PATCH 4/4] refactor: simplify text encoding by removing redundant
 safe_encode_for_llm

---
 lightrag/utils.py | 49 ++---------------------------------------------
 1 file changed, 2 insertions(+), 47 deletions(-)

diff --git a/lightrag/utils.py b/lightrag/utils.py
index 8dad9c22..979517b5 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -1422,7 +1422,7 @@ async def use_llm_func_with_cache(
         LLM response text
     """
     # Sanitize input text to prevent UTF-8 encoding errors for all LLM providers
-    safe_input_text = safe_encode_for_llm(input_text, f"llm_input_{cache_type}")
+    safe_input_text = sanitize_text_for_encoding(input_text)
 
     # Sanitize history messages if provided
     safe_history_messages = None
@@ -1431,9 +1431,7 @@ async def use_llm_func_with_cache(
         for i, msg in enumerate(history_messages):
             safe_msg = msg.copy()
             if "content" in safe_msg:
-                safe_msg["content"] = safe_encode_for_llm(
-                    safe_msg["content"], f"history_message_{i}"
-                )
+                safe_msg["content"] = sanitize_text_for_encoding(safe_msg["content"])
             safe_history_messages.append(safe_msg)
 
     if llm_response_cache:
@@ -1668,49 +1666,6 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
         return text
 
 
-def safe_encode_for_llm(content: str, context: str = "unknown") -> str:
-    """Safely encode content for LLM API calls with comprehensive error handling.
-
-    This is the main function to use before sending text to LLM APIs to prevent
-    UTF-8 encoding errors.
-
-    Args:
-        content: Text content to encode safely
-        context: Context description for logging (e.g., "document_chunk", "prompt")
-
-    Returns:
-        Safely encoded text that won't cause UTF-8 encoding errors
-    """
-    if not content:
-        return content
-
-    original_length = len(content)
-
-    try:
-        # Apply text sanitization
-        sanitized = sanitize_text_for_encoding(content)
-
-        # Check if any changes were made
-        if len(sanitized) != original_length or sanitized != content:
-            # Count replaced characters (empty replacement chars)
-            replaced_count = original_length - len(sanitized)
-            logger.info(
-                f"Text encoding safety: Removed {replaced_count} problematic chars "
-                f"(original: {original_length} chars, sanitized: {len(sanitized)} chars)"
-            )
-
-        return sanitized
-
-    except Exception as e:
-        logger.error(
-            f"Text encoding safety: Failed to sanitize {context} content: {str(e)}"
-        )
-        # Return a safe fallback
-        return (
-            f"[CONTENT_SANITIZATION_ERROR: {original_length} characters from {context}]"
-        )
-
-
 def check_storage_env_vars(storage_name: str) -> None:
     """Check if all required environment variables for storage implementation exist