Optimize JSON string sanitization with precompiled regex and zero-copy

- Precompile regex pattern at module level
- Zero-copy path for clean strings
- Use C-level regex for performance
- Remove deprecated _sanitize_json_data
- Fast detection for common case
yangdx 2025-11-12 15:42:07 +08:00
parent f289cf6225
commit 7f54f47093

@@ -56,6 +56,9 @@ if not logger.handlers:
 # Set httpx logging level to WARNING
 logging.getLogger("httpx").setLevel(logging.WARNING)
 
+# Precompile regex pattern for JSON sanitization (module-level, compiled once)
+_SURROGATE_PATTERN = re.compile(r"[\uD800-\uDFFF\uFFFE\uFFFF]")
+
 # Global import for pypinyin with startup-time logging
 try:
     import pypinyin
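
Why hoist the compile to module scope: the sanitizer runs on every string that goes through JSON serialization, and although Python's re module memoizes compiled patterns internally, calling re.search(pattern, s) still pays a cache lookup per call. Holding the compiled pattern object skips that step. A quick micro-benchmark sketch (the text sample and iteration count are illustrative, not from the commit):

import re
import timeit

# Same pattern the commit precompiles: surrogates plus U+FFFE/U+FFFF non-characters
_SURROGATE_PATTERN = re.compile(r"[\uD800-\uDFFF\uFFFE\uFFFF]")

text = "ordinary clean log line " * 32  # stands in for a typical clean string

# Module-level re.search resolves the pattern through re's internal cache each call
per_call = timeit.timeit(
    lambda: re.search(r"[\uD800-\uDFFF\uFFFE\uFFFF]", text), number=100_000
)

# The precompiled pattern object dispatches straight to the C matcher
precompiled = timeit.timeit(lambda: _SURROGATE_PATTERN.search(text), number=100_000)

print(f"re.search per call : {per_call:.3f}s")
print(f"precompiled search : {precompiled:.3f}s")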
@@ -930,70 +933,24 @@ def load_json(file_name):
 def _sanitize_string_for_json(text: str) -> str:
     """Remove characters that cannot be encoded in UTF-8 for JSON serialization.
 
-    This is a simpler sanitizer specifically for JSON that directly removes
-    problematic characters without attempting to encode first.
+    Uses regex for optimal performance with zero-copy optimization for clean strings.
+    Fast detection path for clean strings (99% of cases) with efficient removal for dirty strings.
 
     Args:
         text: String to sanitize
 
     Returns:
-        Sanitized string safe for UTF-8 encoding in JSON
+        Original string if clean (zero-copy), sanitized string if dirty
     """
     if not text:
         return text
 
-    # Directly filter out problematic characters without pre-validation
-    sanitized = ""
-    for char in text:
-        code_point = ord(char)
-        # Skip surrogate characters (U+D800 to U+DFFF) - main cause of encoding errors
-        if 0xD800 <= code_point <= 0xDFFF:
-            continue
-        # Skip other non-characters in Unicode
-        elif code_point == 0xFFFE or code_point == 0xFFFF:
-            continue
-        else:
-            sanitized += char
-
-    return sanitized
-
-def _sanitize_json_data(data: Any) -> Any:
-    """Recursively sanitize all string values in data structure for safe UTF-8 encoding
-
-    DEPRECATED: This function creates a deep copy of the data which can be memory-intensive.
-    For new code, prefer using write_json with SanitizingJSONEncoder which sanitizes during
-    serialization without creating copies.
-
-    Handles all JSON-serializable types including:
-    - Dictionary keys and values
-    - Lists and tuples (preserves type)
-    - Nested structures
-    - Strings at any level
-
-    Args:
-        data: Data to sanitize (dict, list, tuple, str, or other types)
-
-    Returns:
-        Sanitized data with all strings cleaned of problematic characters
-    """
-    if isinstance(data, dict):
-        # Sanitize both keys and values
-        return {
-            _sanitize_string_for_json(k)
-            if isinstance(k, str)
-            else k: _sanitize_json_data(v)
-            for k, v in data.items()
-        }
-    elif isinstance(data, (list, tuple)):
-        # Handle both lists and tuples, preserve original type
-        sanitized = [_sanitize_json_data(item) for item in data]
-        return type(data)(sanitized)
-    elif isinstance(data, str):
-        return _sanitize_string_for_json(data)
-    else:
-        # Numbers, booleans, None, etc. - return as-is
-        return data
+    # Fast path: Check if sanitization is needed using C-level regex search
+    if not _SURROGATE_PATTERN.search(text):
+        return text  # Zero-copy for clean strings - most common case
+
+    # Slow path: Remove problematic characters using C-level regex substitution
+    return _SURROGATE_PATTERN.sub("", text)
 
 
 class SanitizingJSONEncoder(json.JSONEncoder):
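
To make the new control flow concrete, here is a standalone usage sketch of the patched sanitizer (pattern and function copied from the diff above; the test strings are illustrative):

import re

_SURROGATE_PATTERN = re.compile(r"[\uD800-\uDFFF\uFFFE\uFFFF]")

def _sanitize_string_for_json(text: str) -> str:
    if not text:
        return text
    if not _SURROGATE_PATTERN.search(text):
        return text  # zero-copy: the original object comes back
    return _SURROGATE_PATTERN.sub("", text)

clean = "hello, world"
dirty = "hello\ud800world\uffff"

# Fast path: a clean string is returned as the same object, no allocation
assert _sanitize_string_for_json(clean) is clean

# Slow path: surrogates and non-characters are stripped in one C-level pass
assert _sanitize_string_for_json(dirty) == "helloworld"
assert _sanitize_string_for_json(dirty).encode("utf-8") == b"helloworld"

The hunk ends at the SanitizingJSONEncoder context line, so the encoder's body is not part of this diff. Going only by the removed docstring's description ("sanitizes during serialization without creating copies"), a minimal sketch of one way such an encoder could work, assuming chunk-level sanitization rather than the project's actual implementation:

import json
import re

_SURROGATE_PATTERN = re.compile(r"[\uD800-\uDFFF\uFFFE\uFFFF]")

class SanitizingJSONEncoder(json.JSONEncoder):
    """Sketch only: strip problematic characters from output chunks as
    they are produced, avoiding a deep copy of the input structure."""

    def iterencode(self, o, _one_shot=False):
        for chunk in super().iterencode(o, _one_shot):
            # With ensure_ascii=False, lone surrogates appear verbatim in
            # the emitted chunks and can be stripped in-stream.
            yield _SURROGATE_PATTERN.sub("", chunk)

data = {"key": "value\ud800"}
text = json.dumps(data, cls=SanitizingJSONEncoder, ensure_ascii=False)
assert text == '{"key": "value"}'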