Refactor text cleaning to use sanitize_text_for_encoding consistently

• Replace clean_text with sanitize_text
• Remove deprecated clean_text function
• Add whitespace trimming to sanitizer
• Improve UTF-8 encoding safety
• Consolidate text cleaning logic
This commit is contained in:
yangdx 2025-08-19 19:20:01 +08:00
parent f9cf544805
commit 806081645f
2 changed files with 17 additions and 21 deletions

View file

@@ -85,7 +85,7 @@ from .utils import (
lazy_external_import, lazy_external_import,
priority_limit_async_func_call, priority_limit_async_func_call,
get_content_summary, get_content_summary,
clean_text, sanitize_text_for_encoding,
check_storage_env_vars, check_storage_env_vars,
generate_track_id, generate_track_id,
logger, logger,
@@ -908,8 +908,8 @@ class LightRAG:
update_storage = False update_storage = False
try: try:
# Clean input texts # Clean input texts
full_text = clean_text(full_text) full_text = sanitize_text_for_encoding(full_text)
text_chunks = [clean_text(chunk) for chunk in text_chunks] text_chunks = [sanitize_text_for_encoding(chunk) for chunk in text_chunks]
file_path = "" file_path = ""
# Process cleaned texts # Process cleaned texts
@@ -1020,7 +1020,7 @@ class LightRAG:
# Generate contents dict and remove duplicates in one pass # Generate contents dict and remove duplicates in one pass
unique_contents = {} unique_contents = {}
for id_, doc, path in zip(ids, input, file_paths): for id_, doc, path in zip(ids, input, file_paths):
cleaned_content = clean_text(doc) cleaned_content = sanitize_text_for_encoding(doc)
if cleaned_content not in unique_contents: if cleaned_content not in unique_contents:
unique_contents[cleaned_content] = (id_, path) unique_contents[cleaned_content] = (id_, path)
@@ -1033,7 +1033,7 @@ class LightRAG:
# Clean input text and remove duplicates in one pass # Clean input text and remove duplicates in one pass
unique_content_with_paths = {} unique_content_with_paths = {}
for doc, path in zip(input, file_paths): for doc, path in zip(input, file_paths):
cleaned_content = clean_text(doc) cleaned_content = sanitize_text_for_encoding(doc)
if cleaned_content not in unique_content_with_paths: if cleaned_content not in unique_content_with_paths:
unique_content_with_paths[cleaned_content] = path unique_content_with_paths[cleaned_content] = path
@@ -1817,7 +1817,7 @@ class LightRAG:
all_chunks_data: dict[str, dict[str, str]] = {} all_chunks_data: dict[str, dict[str, str]] = {}
chunk_to_source_map: dict[str, str] = {} chunk_to_source_map: dict[str, str] = {}
for chunk_data in custom_kg.get("chunks", []): for chunk_data in custom_kg.get("chunks", []):
chunk_content = clean_text(chunk_data["content"]) chunk_content = sanitize_text_for_encoding(chunk_data["content"])
source_id = chunk_data["source_id"] source_id = chunk_data["source_id"]
file_path = chunk_data.get("file_path", "custom_kg") file_path = chunk_data.get("file_path", "custom_kg")
tokens = len(self.tokenizer.encode(chunk_content)) tokens = len(self.tokenizer.encode(chunk_content))

View file

@@ -1575,18 +1575,6 @@ def normalize_extracted_info(name: str, is_entity=False) -> str:
return name return name
def clean_text(text: str) -> str:
"""Clean text by removing null bytes (0x00) and whitespace
Args:
text: Input text to clean
Returns:
Cleaned text
"""
return text.strip().replace("\x00", "")
def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str: def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
"""Sanitize text to ensure safe UTF-8 encoding by removing or replacing problematic characters. """Sanitize text to ensure safe UTF-8 encoding by removing or replacing problematic characters.
@@ -1594,6 +1582,7 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
- Surrogate characters (the main cause of the encoding error) - Surrogate characters (the main cause of the encoding error)
- Other invalid Unicode sequences - Other invalid Unicode sequences
- Control characters that might cause issues - Control characters that might cause issues
- Whitespace trimming
Args: Args:
text: Input text to sanitize text: Input text to sanitize
@@ -1609,7 +1598,14 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
return text return text
try: try:
# First, try to encode/decode to catch any encoding issues early # First, strip whitespace
text = text.strip()
# Early return if text is empty after basic cleaning
if not text:
return text
# Try to encode/decode to catch any encoding issues early
text.encode("utf-8") text.encode("utf-8")
# Remove or replace surrogate characters (U+D800 to U+DFFF) # Remove or replace surrogate characters (U+D800 to U+DFFF)
@@ -1630,8 +1626,8 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
else: else:
sanitized += char sanitized += char
# Additional cleanup: remove null bytes and other control characters # Additional cleanup: remove null bytes and other control characters that might cause issues
# that might cause issues (but preserve common whitespace) # (but preserve common whitespace like \t, \n, \r)
sanitized = re.sub( sanitized = re.sub(
r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]", replacement_char, sanitized r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]", replacement_char, sanitized
) )