From 806081645f5120dfcf72dbdb014c629b23f09905 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Tue, 19 Aug 2025 19:20:01 +0800
Subject: [PATCH] Refactor text cleaning to use sanitize_text_for_encoding
 consistently
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Replace clean_text with sanitize_text_for_encoding
• Remove deprecated clean_text function
• Add whitespace trimming to sanitizer
• Improve UTF-8 encoding safety
• Consolidate text cleaning logic
---
 lightrag/lightrag.py | 12 ++++++------
 lightrag/utils.py    | 26 +++++++++++---------------
 2 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index af700393..e2f9a3d7 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -85,7 +85,7 @@ from .utils import (
     lazy_external_import,
     priority_limit_async_func_call,
     get_content_summary,
-    clean_text,
+    sanitize_text_for_encoding,
     check_storage_env_vars,
     generate_track_id,
     logger,
@@ -908,8 +908,8 @@ class LightRAG:
         update_storage = False
         try:
             # Clean input texts
-            full_text = clean_text(full_text)
-            text_chunks = [clean_text(chunk) for chunk in text_chunks]
+            full_text = sanitize_text_for_encoding(full_text)
+            text_chunks = [sanitize_text_for_encoding(chunk) for chunk in text_chunks]
             file_path = ""
 
             # Process cleaned texts
@@ -1020,7 +1020,7 @@ class LightRAG:
         # Generate contents dict and remove duplicates in one pass
         unique_contents = {}
         for id_, doc, path in zip(ids, input, file_paths):
-            cleaned_content = clean_text(doc)
+            cleaned_content = sanitize_text_for_encoding(doc)
             if cleaned_content not in unique_contents:
                 unique_contents[cleaned_content] = (id_, path)
 
@@ -1033,7 +1033,7 @@ class LightRAG:
             # Clean input text and remove duplicates in one pass
             unique_content_with_paths = {}
             for doc, path in zip(input, file_paths):
-                cleaned_content = clean_text(doc)
+                cleaned_content = sanitize_text_for_encoding(doc)
                 if cleaned_content not in unique_content_with_paths:
                     unique_content_with_paths[cleaned_content] = path
 
@@ -1817,7 +1817,7 @@ class LightRAG:
         all_chunks_data: dict[str, dict[str, str]] = {}
         chunk_to_source_map: dict[str, str] = {}
         for chunk_data in custom_kg.get("chunks", []):
-            chunk_content = clean_text(chunk_data["content"])
+            chunk_content = sanitize_text_for_encoding(chunk_data["content"])
             source_id = chunk_data["source_id"]
             file_path = chunk_data.get("file_path", "custom_kg")
             tokens = len(self.tokenizer.encode(chunk_content))
diff --git a/lightrag/utils.py b/lightrag/utils.py
index e830d944..8dad9c22 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -1575,18 +1575,6 @@ def normalize_extracted_info(name: str, is_entity=False) -> str:
     return name
 
 
-def clean_text(text: str) -> str:
-    """Clean text by removing null bytes (0x00) and whitespace
-
-    Args:
-        text: Input text to clean
-
-    Returns:
-        Cleaned text
-    """
-    return text.strip().replace("\x00", "")
-
-
 def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
     """Sanitize text to ensure safe UTF-8 encoding by removing or replacing problematic characters.
 
@@ -1594,6 +1582,7 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
     - Surrogate characters (the main cause of the encoding error)
     - Other invalid Unicode sequences
     - Control characters that might cause issues
+    - Leading and trailing whitespace (trimmed)
 
     Args:
         text: Input text to sanitize
@@ -1609,7 +1598,14 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
         return text
 
     try:
-        # First, try to encode/decode to catch any encoding issues early
+        # First, strip leading and trailing whitespace
+        text = text.strip()
+
+        # Early return if text is empty after basic cleaning
+        if not text:
+            return text
+
+        # Try to encode/decode to catch any encoding issues early
         text.encode("utf-8")
 
         # Remove or replace surrogate characters (U+D800 to U+DFFF)
@@ -1630,8 +1626,8 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
             else:
                 sanitized += char
 
-        # Additional cleanup: remove null bytes and other control characters
-        # that might cause issues (but preserve common whitespace)
+        # Additional cleanup: remove null bytes and other control characters that might cause issues
+        # (but preserve common whitespace like \t, \n, \r)
         sanitized = re.sub(
             r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]", replacement_char, sanitized
         )
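
Note (editor's sketch, not part of the patch): a minimal illustration of
the consolidated sanitizer's behavior, assuming only the control flow
visible in the hunks above (strip first, then the surrogate pass, then
the control-character re.sub). The handling of strings that fail the
early encode check falls outside the visible hunks, so surrogate input
is not exercised here:

    from lightrag.utils import sanitize_text_for_encoding

    raw = "  chunk text\x00 with a stray control char\x07  "
    print(sanitize_text_for_encoding(raw))
    # -> "chunk text with a stray control char"
    # .strip() drops the surrounding spaces; the re.sub() pass drops
    # \x00 and \x07 (both inside \x00-\x08), while \t, \n, and \r
    # inside the text would survive.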
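
The strip() that moved into the sanitizer also matters for the
dedupe-by-cleaned-content loops in lightrag.py: inputs that differ only
in surrounding whitespace or stray null bytes now collapse to the same
key. A small sketch with hypothetical inputs:

    from lightrag.utils import sanitize_text_for_encoding

    docs = ["same text  ", "same text\x00"]
    unique_contents = {}
    for doc in docs:
        cleaned = sanitize_text_for_encoding(doc)
        if cleaned not in unique_contents:
            unique_contents[cleaned] = doc
    assert list(unique_contents) == ["same text"]  # one key survives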