cherry-pick abeaac84
This commit is contained in:
parent
60b6b6bbae
commit
1368d3a1fe
1 changed file with 53 additions and 103 deletions
@@ -56,9 +56,6 @@ if not logger.handlers:
# Set httpx logging level to WARNING
logging.getLogger("httpx").setLevel(logging.WARNING)

# Precompile regex pattern for JSON sanitization (module-level, compiled once)
_SURROGATE_PATTERN = re.compile(r"[\uD800-\uDFFF\uFFFE\uFFFF]")

# Global import for pypinyin with startup-time logging
try:
    import pypinyin
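For reference, a minimal sketch (not part of the commit) of the failure mode the precompiled pattern guards against: a lone UTF-16 surrogate is a legal code point inside a Python str but cannot be encoded as UTF-8, and the character class above matches exactly those code points plus the U+FFFE/U+FFFF noncharacters.

import re

_SURROGATE_PATTERN = re.compile(r"[\uD800-\uDFFF\uFFFE\uFFFF]")

dirty = "ok\ud800end"  # lone surrogate: a valid str, but not UTF-8-encodable
try:
    dirty.encode("utf-8")
except UnicodeEncodeError:
    pass  # the error json.dump would hit without sanitization
assert _SURROGATE_PATTERN.sub("", dirty) == "okend"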
@@ -933,120 +930,73 @@ def load_json(file_name):
def _sanitize_string_for_json(text: str) -> str:
    """Remove characters that cannot be encoded in UTF-8 for JSON serialization.

    Uses a precompiled regex with a zero-copy fast path: clean strings (~99% of
    cases) are detected cheaply and returned unchanged, while dirty strings have
    their problematic characters removed. This is a simpler sanitizer
    specifically for JSON that removes problematic characters directly rather
    than attempting to encode first.

    Args:
        text: String to sanitize

    Returns:
        The original string if clean (zero-copy), otherwise a sanitized string
        safe for UTF-8 encoding in JSON
    """
    if not text:
        return text

    # Fast path: check whether sanitization is needed using a C-level regex search
    if not _SURROGATE_PATTERN.search(text):
        return text  # Zero-copy for clean strings - the most common case

    # Slow path: remove problematic characters using C-level regex substitution
    return _SURROGATE_PATTERN.sub("", text)

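A quick illustrative check of the two paths (assumed usage, not from the commit): a clean string comes back as the identical object, which the `is` test makes visible, while a dirty string comes back with the offending code points stripped.

clean = "hello, world"
assert _sanitize_string_for_json(clean) is clean  # fast path: zero-copy

dirty = "bad\ud800data"
assert _sanitize_string_for_json(dirty) == "baddata"  # slow path: surrogate removed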
class SanitizingJSONEncoder(json.JSONEncoder):
    """
    Custom JSON encoder that sanitizes data during serialization.

    This encoder cleans strings during the encoding process without creating
    a full copy of the data structure, making it memory-efficient for large
    datasets.
    """

    def encode(self, o):
        """Override encode to handle the simple top-level string case"""
        if isinstance(o, str):
            return json.encoder.encode_basestring(_sanitize_string_for_json(o))
        return super().encode(o)

    def iterencode(self, o, _one_shot=False):
        """
        Override iterencode to sanitize strings during serialization.
        This is the core method that handles complex nested structures.
        """
        # Preprocess: sanitize all strings in the object
        sanitized = self._sanitize_for_encoding(o)

        # Call the parent's iterencode with the sanitized data
        for chunk in super().iterencode(sanitized, _one_shot):
            yield chunk

    def _sanitize_for_encoding(self, obj):
        """
        Recursively sanitize strings in an object.
        Creates new objects only when necessary to avoid deep copies.

        Args:
            obj: Object to sanitize

        Returns:
            Sanitized object with cleaned strings
        """
        if isinstance(obj, str):
            return _sanitize_string_for_json(obj)

        elif isinstance(obj, dict):
            # Create a new dict with sanitized keys and values
            new_dict = {}
            for k, v in obj.items():
                clean_k = _sanitize_string_for_json(k) if isinstance(k, str) else k
                clean_v = self._sanitize_for_encoding(v)
                new_dict[clean_k] = clean_v
            return new_dict

        elif isinstance(obj, (list, tuple)):
            # Sanitize list/tuple elements
            cleaned = [self._sanitize_for_encoding(item) for item in obj]
            return type(obj)(cleaned) if isinstance(obj, tuple) else cleaned

        else:
            # Numbers, booleans, None, etc. remain unchanged
            return obj

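A small usage sketch of the encoder (illustrative, not from the commit): the overridden iterencode routes nested containers through _sanitize_for_encoding before the parent class serializes them, so the dump succeeds without a prior deep copy of the payload.

payload = {"name\uffff": ["ok", "x\ud800y", {"n": 1, "flag": True}]}
text = json.dumps(payload, ensure_ascii=False, cls=SanitizingJSONEncoder)
assert text == '{"name": ["ok", "xy", {"n": 1, "flag": true}]}'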
def _sanitize_json_data(data: Any) -> Any:
    """Recursively sanitize all string values in a data structure for safe UTF-8 encoding.

    Handles all JSON-serializable types including:
    - Dictionary keys and values
    - Lists and tuples (preserves type)
    - Nested structures
    - Strings at any level

    Args:
        data: Data to sanitize (dict, list, tuple, str, or other types)

    Returns:
        Sanitized data with all strings cleaned of problematic characters
    """
    if isinstance(data, dict):
        # Sanitize both keys and values
        return {
            (_sanitize_string_for_json(k) if isinstance(k, str) else k):
                _sanitize_json_data(v)
            for k, v in data.items()
        }
    elif isinstance(data, (list, tuple)):
        # Handle both lists and tuples, preserving the original type
        sanitized = [_sanitize_json_data(item) for item in data]
        return type(data)(sanitized)
    elif isinstance(data, str):
        return _sanitize_string_for_json(data)
    else:
        # Numbers, booleans, None, etc. - return as-is
        return data

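A brief sketch of the type-preserving recursion (illustrative values): tuples come back as tuples, non-string scalars pass through untouched, and keys are cleaned alongside values.

data = {"k\ud800": ("a\ufffe", 1, None, ["x\ud800"])}
assert _sanitize_json_data(data) == {"k": ("a", 1, None, ["x"])}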
def write_json(json_obj, file_name):
    """
    Write JSON data to a file with an optimized sanitization strategy.

    This function uses a two-stage approach:
    1. Fast path: try direct serialization (works for clean data ~99% of the time)
    2. Slow path: use the custom encoder that sanitizes during serialization

    The custom-encoder approach avoids creating a deep copy of the data,
    making it memory-efficient. When sanitization occurs, the caller should
    reload the cleaned data from the file to update shared memory.

    Args:
        json_obj: Object to serialize (may be a shallow copy from shared memory)
        file_name: Output file path

    Returns:
        bool: True if sanitization was applied (caller should reload the data),
        False if the direct write succeeded (no reload needed)
    """
    try:
        # Strategy 1: Fast path - try direct serialization
        with open(file_name, "w", encoding="utf-8") as f:
            json.dump(json_obj, f, indent=2, ensure_ascii=False)
        return False  # No sanitization needed, no reload required

    except (UnicodeEncodeError, UnicodeDecodeError) as e:
        logger.debug(f"Direct JSON write failed, using sanitizing encoder: {e}")

        # Strategy 2: Custom encoder (sanitizes during serialization, no deep copy)
        with open(file_name, "w", encoding="utf-8") as f:
            json.dump(json_obj, f, indent=2, ensure_ascii=False, cls=SanitizingJSONEncoder)

        logger.info(f"JSON sanitization applied during write: {file_name}")
        return True  # Sanitization applied, reload recommended

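A hypothetical caller honoring the reload contract from the docstring (load_json is the loader named in the hunk header above; the shared-memory setup is assumed):

records = {"title": "demo\ud800"}  # dirty data, e.g. from shared memory
if write_json(records, "records.json"):
    # Sanitization happened during the write; refresh the in-memory copy
    # so it matches what is now on disk.
    records = load_json("records.json")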
class TokenizerInterface(Protocol):