From 806081645f5120dfcf72dbdb014c629b23f09905 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Tue, 19 Aug 2025 19:20:01 +0800
Subject: [PATCH] Refactor text cleaning to use sanitize_text_for_encoding
 consistently
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Replace clean_text with sanitize_text_for_encoding
• Remove deprecated clean_text function
• Add whitespace trimming to sanitizer
• Improve UTF-8 encoding safety
• Consolidate text cleaning logic
---
 lightrag/lightrag.py | 12 ++++++------
 lightrag/utils.py    | 26 +++++++++++---------------
 2 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index af700393..e2f9a3d7 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -85,7 +85,7 @@ from .utils import (
     lazy_external_import,
     priority_limit_async_func_call,
     get_content_summary,
-    clean_text,
+    sanitize_text_for_encoding,
     check_storage_env_vars,
     generate_track_id,
     logger,
@@ -908,8 +908,8 @@ class LightRAG:
         update_storage = False
         try:
             # Clean input texts
-            full_text = clean_text(full_text)
-            text_chunks = [clean_text(chunk) for chunk in text_chunks]
+            full_text = sanitize_text_for_encoding(full_text)
+            text_chunks = [sanitize_text_for_encoding(chunk) for chunk in text_chunks]
             file_path = ""
 
             # Process cleaned texts
@@ -1020,7 +1020,7 @@ class LightRAG:
         # Generate contents dict and remove duplicates in one pass
         unique_contents = {}
         for id_, doc, path in zip(ids, input, file_paths):
-            cleaned_content = clean_text(doc)
+            cleaned_content = sanitize_text_for_encoding(doc)
             if cleaned_content not in unique_contents:
                 unique_contents[cleaned_content] = (id_, path)
 
@@ -1033,7 +1033,7 @@ class LightRAG:
             # Clean input text and remove duplicates in one pass
             unique_content_with_paths = {}
             for doc, path in zip(input, file_paths):
-                cleaned_content = clean_text(doc)
+                cleaned_content = sanitize_text_for_encoding(doc)
                 if cleaned_content not in unique_content_with_paths:
                     unique_content_with_paths[cleaned_content] = path
 
@@ -1817,7 +1817,7 @@ class LightRAG:
         all_chunks_data: dict[str, dict[str, str]] = {}
         chunk_to_source_map: dict[str, str] = {}
         for chunk_data in custom_kg.get("chunks", []):
-            chunk_content = clean_text(chunk_data["content"])
+            chunk_content = sanitize_text_for_encoding(chunk_data["content"])
             source_id = chunk_data["source_id"]
             file_path = chunk_data.get("file_path", "custom_kg")
             tokens = len(self.tokenizer.encode(chunk_content))
diff --git a/lightrag/utils.py b/lightrag/utils.py
index e830d944..8dad9c22 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -1575,18 +1575,6 @@ def normalize_extracted_info(name: str, is_entity=False) -> str:
     return name
 
 
-def clean_text(text: str) -> str:
-    """Clean text by removing null bytes (0x00) and whitespace
-
-    Args:
-        text: Input text to clean
-
-    Returns:
-        Cleaned text
-    """
-    return text.strip().replace("\x00", "")
-
-
 def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
     """Sanitize text to ensure safe UTF-8 encoding by removing or replacing problematic characters.
 
@@ -1594,6 +1582,7 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
     - Surrogate characters (the main cause of the encoding error)
     - Other invalid Unicode sequences
     - Control characters that might cause issues
+    - Leading and trailing whitespace (trimmed)
 
     Args:
         text: Input text to sanitize
@@ -1609,7 +1598,14 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
         return text
 
     try:
-        # First, try to encode/decode to catch any encoding issues early
+        # First, strip leading and trailing whitespace
+        text = text.strip()
+
+        # Early return if text is empty after basic cleaning
+        if not text:
+            return text
+
+        # Try to encode/decode to catch any encoding issues early
         text.encode("utf-8")
 
         # Remove or replace surrogate characters (U+D800 to U+DFFF)
@@ -1630,8 +1626,8 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
             else:
                 sanitized += char
 
-        # Additional cleanup: remove null bytes and other control characters
-        # that might cause issues (but preserve common whitespace)
+        # Additional cleanup: remove null bytes and other control characters that might cause issues
+        # (but preserve common whitespace like \t, \n, \r)
         sanitized = re.sub(
             r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]", replacement_char, sanitized
         )
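
Note (editor's sketch, not part of the patch): a minimal illustration of
the consolidated sanitizer's behavior, assuming only the control flow
visible in the hunks above (strip first, then the surrogate pass, then
the control-character re.sub). The handling of strings that fail the
early encode check falls outside the visible hunks, so surrogate input
is not exercised here:

    from lightrag.utils import sanitize_text_for_encoding

    raw = "  chunk text\x00 with a stray control char\x07  "
    print(sanitize_text_for_encoding(raw))
    # -> "chunk text with a stray control char"
    # .strip() drops the surrounding spaces; the re.sub() pass drops
    # \x00 and \x07 (both inside \x00-\x08), while \t, \n, and \r
    # inside the text would survive.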
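
The strip() that moved into the sanitizer also matters for the
dedupe-by-cleaned-content loops in lightrag.py: inputs that differ only
in surrounding whitespace or stray null bytes now collapse to the same
key. A small sketch with hypothetical inputs:

    from lightrag.utils import sanitize_text_for_encoding

    docs = ["same text  ", "same text\x00"]
    unique_contents = {}
    for doc in docs:
        cleaned = sanitize_text_for_encoding(doc)
        if cleaned not in unique_contents:
            unique_contents[cleaned] = doc
    assert list(unique_contents) == ["same text"]  # one key survives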