From d4bbc5dea9f0774c9662d7d14d9443f96508f8ef Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 31 Aug 2025 10:36:56 +0800 Subject: [PATCH] refactor: Merge multi-step text sanitization into single function --- lightrag/operate.py | 49 ++++++++++++--------------------------------- lightrag/utils.py | 40 ++++++++++++++++++++---------------- 2 files changed, 36 insertions(+), 53 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index afa8205f..b83790ab 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -11,11 +11,10 @@ from collections import Counter, defaultdict from .utils import ( logger, - clean_str, compute_mdhash_id, Tokenizer, is_float_regex, - normalize_extracted_info, + sanitize_and_normalize_extracted_text, pack_user_ass_to_openai_messages, split_string_by_multi_markers, truncate_list_by_token_size, @@ -31,7 +30,6 @@ from .utils import ( pick_by_vector_similarity, process_chunks_unified, build_file_path, - sanitize_text_for_encoding, ) from .base import ( BaseGraphStorage, @@ -320,14 +318,9 @@ async def _handle_single_entity_extraction( return None try: - # Step 1: Strict UTF-8 encoding sanitization (fail-fast approach) - entity_name = sanitize_text_for_encoding(record_attributes[1]) - - # Step 2: HTML and control character cleaning - entity_name = clean_str(entity_name).strip() - - # Step 3: Business logic normalization - entity_name = normalize_extracted_info(entity_name, is_entity=True) + entity_name = sanitize_and_normalize_extracted_text( + record_attributes[1], is_entity=True + ) # Validate entity name after all cleaning steps if not entity_name or not entity_name.strip(): @@ -337,8 +330,8 @@ async def _handle_single_entity_extraction( return None # Process entity type with same cleaning pipeline - entity_type = sanitize_text_for_encoding(record_attributes[2]) - entity_type = clean_str(entity_type).strip('"') + entity_type = sanitize_and_normalize_extracted_text(record_attributes[2]) + if not entity_type.strip() or 
entity_type.startswith('("'): logger.warning( f"Entity extraction error: invalid entity type in: {record_attributes}" @@ -346,9 +339,7 @@ async def _handle_single_entity_extraction( return None # Process entity description with same cleaning pipeline - entity_description = sanitize_text_for_encoding(record_attributes[3]) - entity_description = clean_str(entity_description) - entity_description = normalize_extracted_info(entity_description) + entity_description = sanitize_and_normalize_extracted_text(record_attributes[3]) if not entity_description.strip(): logger.warning( @@ -385,27 +376,17 @@ async def _handle_single_relationship_extraction( return None try: - # Process source and target entities with strict cleaning pipeline - # Step 1: Strict UTF-8 encoding sanitization (fail-fast approach) - source = sanitize_text_for_encoding(record_attributes[1]) - # Step 2: HTML and control character cleaning - source = clean_str(source) - # Step 3: Business logic normalization - source = normalize_extracted_info(source, is_entity=True) - - # Same pipeline for target entity - target = sanitize_text_for_encoding(record_attributes[2]) - target = clean_str(target) - target = normalize_extracted_info(target, is_entity=True) + source = sanitize_and_normalize_extracted_text(record_attributes[1]) + target = sanitize_and_normalize_extracted_text(record_attributes[2]) # Validate entity names after all cleaning steps - if not source or not source.strip(): + if not source: logger.warning( f"Relationship extraction error: source entity became empty after cleaning. Original: '{record_attributes[1]}'" ) return None - if not target or not target.strip(): + if not target: logger.warning( f"Relationship extraction error: target entity became empty after cleaning. 
Original: '{record_attributes[2]}'" ) @@ -418,14 +399,10 @@ async def _handle_single_relationship_extraction( return None # Process relationship description with same cleaning pipeline - edge_description = sanitize_text_for_encoding(record_attributes[3]) - edge_description = clean_str(edge_description) - edge_description = normalize_extracted_info(edge_description) + edge_description = sanitize_and_normalize_extracted_text(record_attributes[3]) # Process keywords with same cleaning pipeline - edge_keywords = sanitize_text_for_encoding(record_attributes[4]) - edge_keywords = clean_str(edge_keywords) - edge_keywords = normalize_extracted_info(edge_keywords, is_entity=True) + edge_keywords = sanitize_and_normalize_extracted_text(record_attributes[4]) edge_keywords = edge_keywords.replace(",", ",") edge_source_id = chunk_key diff --git a/lightrag/utils.py b/lightrag/utils.py index 87ce5b6a..82a7cce4 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -931,19 +931,6 @@ def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str] return [r.strip() for r in results if r.strip()] -# Refer the utils functions of the official GraphRAG implementation: -# https://github.com/microsoft/graphrag -def clean_str(input: Any) -> str: - """Clean an input string by removing HTML escapes, control characters, and other unwanted characters.""" - # If we get non-string input, just give it back - if not isinstance(input, str): - return input - - result = html.unescape(input.strip()) - # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python - return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", result) - - def is_float_regex(value: str) -> bool: return bool(re.match(r"^[-+]?[0-9]*\.?[0-9]+$", value)) @@ -1728,6 +1715,20 @@ def get_content_summary(content: str, max_length: int = 250) -> str: return content[:max_length] + "..." 
+def sanitize_and_normalize_extracted_text(input_text: str, is_entity=False) -> str: + """Sanitize and normalize extracted text + Args: + input_text: text string to be processed + is_entity: whether the input text is an entity or relation name + + Returns: + Sanitized and normalized text string + """ + safe_input_text = sanitize_text_for_encoding(input_text) + normalized_text = normalize_extracted_info(safe_input_text, is_entity) + return normalized_text + + def normalize_extracted_info(name: str, is_entity=False) -> str: """Normalize entity/relation names and description with the following rules: 1. Remove spaces between Chinese characters @@ -1789,6 +1790,8 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str: - Surrogate characters (the main cause of encoding errors) - Other invalid Unicode sequences - Control characters that might cause issues + - Unescape HTML escapes + - Remove control characters - Whitespace trimming Args: @@ -1801,9 +1804,6 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str: Raises: ValueError: When text contains uncleanable encoding issues that cannot be safely processed """ - if not isinstance(text, str): - return str(text) - if not text: return text @@ -1845,7 +1845,13 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str: # Test final encoding to ensure it's safe sanitized.encode("utf-8") - return sanitized + # Unescape HTML escapes + sanitized = html.unescape(sanitized) + + # Remove control characters + sanitized = re.sub(r"[\x00-\x1f\x7f-\x9f]", "", sanitized) + + return sanitized.strip() except UnicodeEncodeError as e: # Critical change: Don't return placeholder, raise exception for caller to handle